require(tidyverse)
require(knitr)
require(DiagrammeR)
require(geosphere)
require(ade4)
require(adespatial)
require(vcfR)
require(plotly)
require(vegan)
require(adegenet)

#HDplot
#library(vcfR)
#library(ggplot2)
#library(dplyr)
#library(stringr)

#HDplot function
#HDplot (McKinney et al. 2017): detect putative paralogs (duplicated loci)
#from SNP data. For each locus it computes heterozygosity (H) and the
#read-ratio deviation (D), a z-score for how far allele balance in
#heterozygotes departs from the 1:1 binomial expectation.
#
#Argument: vcfData -- a vcfR object; @gt must carry GT and AD fields.
#Returns a data frame with one row per locus:
#  CHROM/POS/ID    locus identifiers from the vcf fixed columns
#  depth_a/depth_b reads per allele summed across heterozygous samples
#  ratio           depth_a / (depth_a + depth_b)
#  num_hets        count of heterozygous genotypes at the locus
#  num_samples     total samples; num_called = non-missing genotypes
#  H_all           hets / all samples; H = hets / called genotypes
#  std             binomial SD of total het reads under 1:1 balance
#  D               z-score of allele-balance deviation (HDplot y-axis)
HDplot<-function(vcfData){
  #set up results table (one row per variant)
  HDplotTable<-as.data.frame(matrix(NA,nrow=dim(vcfData@gt)[1],ncol=13))
  colnames(HDplotTable)<-c("CHROM","POS","ID","depth_a","depth_b","ratio","num_hets","num_samples","num_called","H_all","H","std","D")
  
  #get genotypes from vcf file (loci x samples character matrix)
  genos<-extract.gt(vcfData, element = "GT", mask = FALSE, as.numeric = FALSE, return.alleles = FALSE, 
                    IDtoRowNames = TRUE, extract = TRUE, convertNA = FALSE)
  
  #replace NA genotypes with ./.
  genos[is.na(genos)]<-"./."
  
  #get allele reads (AD field, "ref,alt" strings) from vcf file
  reads<-extract.gt(vcfData, element = "AD", mask = FALSE, as.numeric = FALSE, return.alleles = FALSE, 
                    IDtoRowNames = TRUE, extract = TRUE, convertNA = FALSE)
    
  #replace reads for samples with missing data with 0
  #reads<-gsub("\\.,\\.","0,0",reads)
  reads[grepl("\\.",reads)]<-"0,0"
  reads[is.na(reads)]<-"0,0"
  
  #str_split_fixed gives an (nloci x 2) matrix per sample column; apply
  #stacks them column-wise, so alleleReads is a (2*nloci) x nsamples
  #matrix: rows 1..nloci are allele-1 counts, rows nloci+1..2*nloci are
  #allele-2 counts. The second index relies on ":" binding tighter than
  #"+", i.e. n + 1:n == (n+1):(2n).
  alleleReads<-apply(reads,2,function(x) str_split_fixed(x,",",2))
  alleleReads_1<-alleleReads[1:dim(reads)[1],]
  alleleReads_2<-alleleReads[dim(reads)[1]+1:dim(reads)[1],]
  #convert to numeric format
  alleleReads_1<-apply(alleleReads_1,2, function(x) as.numeric(x))
  alleleReads_2<-apply(alleleReads_2,2, function(x) as.numeric(x))
  #subset to heterozygous genotypes
  #make genotype matrix where heterozygotes are 1 and other genotypes are 0
  #NOTE(review): genotype codings outside this table (e.g. phased "0|1")
  #are not handled and become NA with a warning -- confirm input is unphased
  hetMatrix<-apply(genos,2,function(x) dplyr::recode(x,'0/0'=0,'1/1'=0,'./.'=0,'0/1'=1,'1/0'=1))
  #called genotypes are 1; "./." falls through .default to NA so missing
  #genotypes are excluded from num_called
  calledGenos<-apply(genos,2,function(x) dplyr::recode(x,'0/0'=1,'1/1'=1,'0/1'=1,'1/0'=1,.default=NA_real_))
  #multiply read count matrices by heterozygote matrix to get read counts for heterozygous genotypes
  alleleReads_1_het<-alleleReads_1*hetMatrix
  alleleReads_2_het<-alleleReads_2*hetMatrix
  #rows are loci and columns are samples
  #sum reads per allele per locus for heterozygous samples
  A_reads<-apply(alleleReads_1_het,1,sum)
  B_reads<-apply(alleleReads_2_het,1,sum)
  totalReads<-A_reads+B_reads
  ratio<-A_reads/totalReads
  #binomial SD assuming a fair 50/50 draw of reads between alleles
  std<-sqrt(totalReads*0.5*0.5)
  #D: signed z-score of the deviation from 1:1 allele balance
  z<- -(totalReads/2-A_reads)/std
  #get percent heterozygosity for each locus
  numHets<-apply(hetMatrix,1,sum)
  hetPerc<-numHets/dim(hetMatrix)[2]
  
  #H: heterozygosity among called (non-missing) genotypes only
  numGenos<-apply(calledGenos,1,sum,na.rm=TRUE)
  H<-numHets/numGenos
  
  #assign results to HDplotTable
  HDplotTable$CHROM<-vcfData@fix[,"CHROM"]
  HDplotTable$POS<-vcfData@fix[,"POS"]
  HDplotTable$ID<-vcfData@fix[,"ID"]
  HDplotTable$depth_a<-A_reads
  HDplotTable$depth_b<-B_reads
  HDplotTable$ratio<-ratio
  HDplotTable$num_hets<-numHets
  HDplotTable$num_samples<-dim(hetMatrix)[2]
  HDplotTable$num_called<-numGenos
  HDplotTable$H_all<-hetPerc
  HDplotTable$H<-H
  HDplotTable$std<-std
  HDplotTable$D<-z

  return(HDplotTable)
}

Readme

This is an RStudio project. If you’d like to view pre-rendered figures, read a summary of the analysis, and view the code, please open the relevant html file in a browser.

There are a lot of analyses in this notebook because I was curious about how filtering would impact the results. In the end, we used the same SNP dataset from the Vaux et al ms for consistency and ease of panel development (sections Vaux RDA and Vaux SNP RDA, below), other analyses are included in the notebook as a log, but not evaluated/rendered.

Rationale

We use ddRADseq data from 12 sampling locations across the Pacific Ocean to develop a GTseq marker panel for genetic stock identification and population genetic structure of Pacific Albacore.

Previous analyses used STRUCTURE and PCA to reveal population genetic structure that separates North from South Pacific sampling locations, and outlier approaches identified a set of SNP markers that are strongly differentiated between these two groups. However, no genetic structure was identified at smaller spatial scales using these tools, despite some potential evidence of restricted dispersal at a sub-gyre scale from stable isotope analysis and region-specific growth rates and size.

Given the large effective population size (Ne) of Albacore, we suggest caution in interpreting the PCA and STRUCTURE results as evidence of panmixia within North and South Pacific Albacore populations. Fst at sub-gyre spatial scales was substantially below the threshold at which these tools can infer genetic structure, and large Ne can result in low Fst despite substantial demographic independence among demes.

In this notebook, we select additional features for a future GTseq panel after investigating potential genetic structure across a diversity of spatial scales available in the sampled locations. Identification of potential fine scale genetic structure is accomplished through:

  1. Spatially Explicit Analysis: We will decompose spatial structure among the sampling locations into the full range of scales to attempt to identify the smallest spatial structures that show evidence of population genetic structure in the ddRADseq data.

Additional approaches explored but not fully implemented:

  1. Hierarchical Structure: While significant Fst values between pairwise sampling locales within the North Pacific were low and mostly non-significant, stronger patterns may be observed at increasing levels of hierarchical structure.

  2. Statistical Learning: Subtle polygenic structure among sampling locations may reveal population structure that univariate approaches may miss. Methods like random forest and other statistical learning algorithms have demonstrated greater sensitivity to subtle population genetic structure than methods like PCA and STRUCTURE.

Data

Vaux Genetic Data

Which is the correct stacks catalog?

To overcome filtering steps that require a priori knowledge of population structure, Vaux used 9 different stacks runs to conduct the analyses in the ms. As a consequence we must be extra cautious that we build the input file from the correct data.

Here, we used the appendix from the ms (table A11) to verify which of the 9 different stacks runs is the catalog used to generate the SNP dataset used for FST outlier analysis in the Vaux ms.

The correct stacks catalog is ori-pac-1 ( full directory location: /dfs/Omalley_Lab/vauxf/STACKS/stacks_out/albacore/ori-pac-1/catalog.fa.gz )

Within each of the 9 stacks catalogs, there are many filtering steps. According to table A4, the final set used -p 2 and -r 0.95, however, three different final filtered vcf files match these parameters.

Below are the actual calls to populations used in each of these three directories:

ori-pac-1-p2-r95-a
populations -t 25 -P /nfs1/FW_HMSC/OMalley_Lab/vauxf/STACKS/stacks_out/albacore/ori-pac-1 -M /nfs1/FW_HMSC/OMalley_Lab/vauxf/STACKS/population_maps/albacore/12-2P-bam1.txt -r 0.95 -p 2 –min_maf 0.05 –write_single_snp –structure –fstats –fst_correction bonferroni_win –genepop –plink –vcf –hzar –fasta_samples –fasta_loci

Summary: correct stacks catalog, maf filtering, p 2 and r 0.95, no other filtering

ori-pac-1-p2-r95-a2
populations -t 30 -P /nfs1/FW_HMSC/OMalley_Lab/vauxf/STACKS/stacks_out/albacore/ori-pac-1 -M /nfs1/FW_HMSC/OMalley_Lab/vauxf/STACKS/population_maps/albacore/12-2P-bam1.txt -B /nfs1/FW_HMSC/OMalley_Lab/vauxf/STACKS/blacklists/albacore/ori-pac-1-p2-r90-PSV-LD-C1.blacklist -r 0.95 -p 2 –min_maf 0.05 –write_single_snp –structure –fstats –fst_correction bonferroni_win –genepop –plink –vcf –hzar –fasta_samples –fasta_loci

Summary: correct stacks catalog, maf filtering, p 2 and r 0.95, PSV and LD filtering using the C1 blacklist (971 markers)

ori-pac-1-p2-r95-a3
populations -t 30 -P /nfs1/FW_HMSC/OMalley_Lab/vauxf/STACKS/stacks_out/albacore/ori-pac-1 -M /nfs1/FW_HMSC/OMalley_Lab/vauxf/STACKS/population_maps/albacore/12-2P-bam1.txt -B /nfs1/FW_HMSC/OMalley_Lab/vauxf/STACKS/blacklists/albacore/ori-pac-1-p2-r90-PSV-LD-C2.blacklist -r 0.95 -p 2 –min_maf 0.05 –write_single_snp –structure –fstats –fst_correction bonferroni_win –genepop –plink –vcf –hzar –fasta_samples –fasta_loci

Summary: correct stacks catalog, maf filtering, p 2 and r 0.95, PSV and LD filtering using the C2 blacklist (1490 markers)

Correct Final Filtered Dataset

Using the information in table 2 and table a4, ori-pac-1-p2-r95-a3 is a perfect match to the manuscript. The correct vcf file is at /dfs/Omalley_Lab/vauxf/STACKS/stacks_out/albacore/ori-pac-1/ori-pac-1-p2-r95-a3/ori-pac-1-p2-r95-a3.vcf and locally at ./input_data/vaux_data/ori-pac-1-p2-r95-a3.vcf

This was confirmed with Felix.

# Read the confirmed Vaux final filtered dataset (12,872 variants)
vcf_vaux <- read.vcfR("input_data/vaux_data/ori-pac-1-p2-r95-a3.vcf")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 12872
##   column count: 317
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 12872
##   Character matrix gt cols: 317
##   skip: 0
##   nrows: 12872
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant: 12872
## All variants processed

Dayan STACKS Runs

There are a lot of different versions of the SNP dataset used in the Vaux manuscript.

Here I run stacks populations on the data with my own filtering. I tried to follow Felix’s approach as close as possible to minimize differences with the following exceptions.

-individual filtering: not clear what “failed to sequence” in the ms means, took a middle of the road approach and removed individuals with poor coverage in the bam files
- MAF filtering: got rid of MAF 5% filtering (will not have desired effect given other changes I’m making)
- MAC filtering: because I got rid of MAF filtering, still wanted to filter singletons, set min-mac to 3 (allele present in at least 2 individuals) - r: kept the same as the final Vaux dataset for consistency (SNP must be present in 90% of samples)
- p /popmap: set to 1 because we ran populations as if the Pacific was panmictic. Since we don’t know how population structure relates to sampling structure yet, and some site sample sizes are quite small, I didn’t want to lose too many sites with r: 0.9. Note this didn’t work because there was extreme variation in missingness among sampling sites, and those with high missingness happened to be on the extremes of the range. When we imputed missing data to the mean, this effectively brought the edge populations closer to the center and wiped out any population structure we observed in the smaller dataset.
- did NOT LD prune the dataset.
- did not do HWE filtering (already doing PSV filtering with HDplot and PARALOG-FINDER 1.0)

Filtering Summary:
run 1:
Populations: -r 0.9 -p 1 –min_mac 3 Coverage: individual GT set to missing if depth per individual per SNP was less than 10, then applied a missingness filter again (max missingness at a locus (across all samples) 20%), then excluded coverage outliers

this produce a ton of missing data in the BA and BC samples, which wipes out any N-PAC latitudinal variation in PCA once we impute missing data, so skipped this

run 2: this runs final name is dayan2.snps.vcf Populations: -r 0.9 -p 12 –min_mac 3 (with 12 population popmap) Coverage: individual GT set to missing if depth per individual per SNP was less than 10, then applied a missingness filter again (max missingness at a locus (across all samples) 20%), then excluded coverage outliers

run 3:
this runs final name is dayan3.snps.vcf
Populations: -r 0.9 -p 2 –min_mac 3 (with 2 population popmap i.e. N vs South to better follow the Vaux et al datset used for outlier detection) Coverage: individual GT set to missing if depth per individual per SNP was less than 10, then applied a missingness filter again (max missingness at a locus (across all samples) 20%), then excluded coverage outliers


#make directory for results:
# /dfs/Omalley_Lab/dayan/albacore/feature_selection/stacks

#pop map, because we do not have a priori assumption about the relationship between sampling locations and population structure & some sample sites are small, we should not do per "population filtering" therefore used a popmap and -p value with just 1 big population

# maf, because Ne is big and we are using one big pop, rare variants may be very important, do not do maf filtering, but we still want to get rid of singletons so will use the min-mac filter instead

# -r 0.9



SGE_Batch -q chinook -P 30 -f 30G -m 36G -c '/local/cluster/stacks/stacks-2.41/bin/ref_map.pl -T 30 -o /dfs/Omalley_Lab/dayan/albacore/feature_selection/stacks --popmap /nfs1/FW_HMSC/OMalley_Lab/vauxf/STACKS/population_maps/albacore/all-bam1.txt --samples /nfs1/FW_HMSC/OMalley_Lab/vauxf/BWA/albacore/samfiles/ori-all -X "populations: -r 0.9 -p 12 --min_mac 3 --write_single_snp --vcf --fasta_loci"' -r ori-pac

#this produced a lot of missing data, we'll try with a different -p (and pop map) this will drastically reduce the number of SNPs, but should ameliorate the high variance in missingness across pops

SGE_Batch -q harold -P 30 -f 30G -m 36G -c '/local/cluster/stacks/stacks-2.41/bin/populations -t 30 -P /dfs/Omalley_Lab/dayan/albacore/feature_selection/stacks -M /nfs1/FW_HMSC/OMalley_Lab/vauxf/STACKS/population_maps/albacore/12-sites-bam4.txt  -r 0.9 -p 12 --min_mac 3 --write_single_snp --vcf ' -r ori-pac-12

#another populations run with n-s popmap
SGE_Batch -q harold -P 30 -f 30G -m 36G -c '/local/cluster/stacks/stacks-2.41/bin/populations -t 30 -P /dfs/Omalley_Lab/dayan/albacore/feature_selection/stacks -M n_s_popmap.txt  -r 0.9 -p 2 --min_mac 3 --write_single_snp --vcf --fasta_loci' -r ori-pac-2

let’s take a look at the coverage across sampling sites

# selected the bamstats section out of stacks log and wrote to file

# Per-sample coverage stats extracted from the stacks log (bamstats section)
bamstats <- read_tsv("input_data/bamstats.txt")

# Mean coverage per sampling site, with a 20x reference line.
# Fixes: `fun.y` is deprecated in ggplot2 >= 3.3 (use `fun`), and the hline
# colour belongs outside aes() -- aes(color = "red") maps the literal string
# "red" onto a discrete scale (drawing a legend and a default-palette line)
# instead of actually colouring the line red.
ggplot(bamstats, aes(x = factor(pop), y = mean_cov_ns)) +
  stat_summary(fun = "mean", geom = "bar") +
  theme_classic() +
  geom_hline(yintercept = 20, color = "red")

BC and BA have such low coverage most data will be set to missing. When we do the PCA we’ll impute the mean allele freq and destroy a lot of potential structure.

#low confidence SNPs
vcftools --vcf populations.snps.vcf --minDP 10 --min-meanDP 20 --recode-INFO-all --recode --out 0.2

#run 3 results: After filtering, kept 27483 out of a possible 27773 Sites
#iterative missingness

#bad inds 1 (really bad inds >50% missingness)
vcftools  --vcf 0.2.recode.vcf  --missing-indv
awk '$5 > 0.5 && NR>1 {print $1}' out.imiss > bad_inds1
#run 3: removed 19 bad inds

#bad snps 1 (after removing the bad inds, remove really bad gts (50% missingnes))
vcftools  --vcf 0.2.recode.vcf  --remove bad_inds1 --max-missing 0.5 --recode-INFO-all --recode --out 0.3
# run 3 results: 27469 out of a possible 27483 Sites


#bad inds 2 (less bad individuals > 30% missingness)
vcftools  --vcf 0.3.recode.vcf  --missing-indv
awk '$5 > 0.3 && NR>1 {print $1}' out.imiss > bad_inds2

#bad snps2 (after second round of bad inds removed get rid of moderately bad gts 90%)
vcftools  --vcf 0.3.recode.vcf  --remove bad_inds2 --max-missing 0.9 --recode-INFO-all --recode --out 0.4
#run 3 results: 286 out of 313 Individuals, 22277 out of a possible 27469 Sites


#run 2 results
#After filtering, kept 312 out of 328 Individuals
#Outputting VCF file...
#After filtering, kept 12519 out of a possible 13572 Sites

#next we grab ldepths to filter the high coverage outliers

vcftools  --vcf 0.4.recode.vcf --site-mean-depth
# Per-site mean depth written by `vcftools --site-mean-depth` (run 3)
depths <- read_tsv("input_data/out.ldepth_3.mean")
ggplot(data=depths)+geom_histogram(aes(x = MEAN_DEPTH))

#flag outliers as sites exceeding 2.5X the modal depth
# Return the modal (most frequent) value of a vector.
# Ties are broken in favour of the value that occurs first in `x`.
getMode <- function(x) {
  distinct_vals <- unique(x)
  counts <- tabulate(match(x, distinct_vals))
  distinct_vals[which.max(counts)]
}


# Density of site mean depth with the mode (blue) and the 2.5x-mode outlier
# cutoff (red) marked
ggplot()+geom_density(data=depths, aes(MEAN_DEPTH)) +geom_vline(aes(xintercept=(2.5*getMode(depths$MEAN_DEPTH))), color = "red") +geom_vline(aes(xintercept=(getMode(depths$MEAN_DEPTH))), color = "blue") 

# Flag sites whose mean depth exceeds 2.5x the modal depth (likely collapsed
# repeats/paralogs); assumes columns 1-2 of the ldepth file are CHROM, POS
tmp <- depths[depths$MEAN_DEPTH > 2.5*getMode(depths$MEAN_DEPTH),c(1,2)]
bad_sites <- tmp %>%
  select(CHROM, POS)
# NOTE(review): `quote_escape` is deprecated in readr >= 2.0 (now `escape`)
write_tsv(bad_sites, "input_data/bad_sites.txt", col_names = FALSE, quote_escape = FALSE)

#89 bad sites (run 2)
# 294 bad sites (run 3)
vcftools --vcf 0.4.recode.vcf --exclude-positions bad_sites.txt --recode-INFO-all --recode --out 0.5

Next we get rid of potential paralogues using paralog-finder

#paralog-finder runs on old Python (2.7), so created a conda env and installed all relevant modules there
conda create --name paralog_finder python=2.7
source activate paralog_finder
conda install pandas scipy numpy statsmodels #install dependencies

#run paralog finder 
#hdplot stats (run on snps straight from stacks due to vcftools format issues)
python ~/Science/programs/paralog-finder/HDplot_process_vcf.py -i populations_3.snps.vcf

#plot hdplot figures
Rscript ~/Science/programs/paralog-finder/HDplot_graphs.R -i dayan2.depthsBias --minD -50 --maxD 50

#took a look at plots, looked excellent (very few potential paralogs), but continued anyway
#got black list using default values (read ratio deviation (D) = +- 7)
python ~/Science/programs/paralog-finder/blacklist_paralogs.py -i dayan2.depthsBias

#this collected 96 potential PSVs
# paralog finder outputs SNPs by catalogue number, which breaks when using write_single_snps and also is a format only understood by stacks so we have to do a little work to avoid running populations again

cut -f 4 dayan2.depthsBias | cut -d : -f 1 > catalog_numbers
paste catalog_numbers dayan2.depthsBias > depth_bias2
awk 'FNR==NR { a[$1]; next } ($1 in a){print $3,$4}' dayan2_paralogs.blacklist depth_bias2 > psv_blacklist.txt

vcftools --vcf 0.5_run3.recode.vcf --exclude-positions psv_blacklist.txt --recode-INFO-all --recode --out 1.0_run3

#run 2
#After filtering, kept 312 out of 312 Individuals
#Outputting VCF file...
#After filtering, kept 12354 out of a possible 12430 Sites

#run 3
#After filtering, kept 286 out of 286 Individuals
#After filtering, kept 21929 out of a possible 21983 Sites

Let’s do some quick QC checks on this

How many individuals are left per population

vcf <- read.vcfR("input_data/1.0_run3.recode.vcf")
genind <- vcfR2genind(vcf)

# Encode sampling site as population: the first two characters of each
# sample name are the site code. Use the genind row names so labels stay
# aligned with the genotype table even if vcfR2genind dropped or reordered
# samples. (The original assigned pop twice -- once from row names, once
# from vcf@gt column names; the second assignment was redundant and would
# silently mis-label samples if the two name vectors ever diverged.)
genind$pop <- as.factor(str_sub(row.names(genind$tab), 1, 2))

# Count individuals per sampling site
pops <- as.data.frame(as.character(genind$pop))
colnames(pops) <- "pop"
pops %>%
  group_by(pop) %>%
  count()

At this point BA (Baja CA) only has 2 individuals and BC only 3. Let’s cut them before doing any actual analyses

require(whoa)
#vcf <- read.vcfR("input_data/1.0_run3.recode.vcf")

# Expected vs observed genotype frequencies per locus (whoa package), used
# to gauge the heterozygote miscall rate
gfreqs <- exp_and_obs_geno_freqs(vcf)

# then plot those.  Set max_plot_loci so that all 2000
# loci will be plotted
geno_freqs_scatter(gfreqs, max_plot_loci = 2000)

# Estimate the overall het miscall rate m; the huge minBin forces a single
# read-depth bin (one overall estimate rather than depth-stratified)
overall <- infer_m(vcf, minBin = 1e15 )
overall$m_posteriors

Mean het miscall rate for this dataset is 5.3%.

Missing data?

# Per-sample missingness: proportion of variants with a missing DP value
dp <- extract.gt(vcf, element = "DP", as.numeric = TRUE)

na_per_sample <- colSums(is.na(dp))
myMiss <- na_per_sample / nrow(vcf)

hist(myMiss)

duplicates: there are a handful of individuals with duplicate sample names that wind up as strong outliers in the PCA let’s remove these and see if they are driving the PCA structure

# Sample names carry suffixes beyond the first 7 characters; truncating
# exposes re-sequenced duplicates, which are then dropped (first copy kept).
# Column 1 of @gt is FORMAT, never duplicated, so it is always retained.
colnames(vcf@gt) <- str_sub(colnames(vcf@gt), start = 1, end = 7)
duplicated(colnames(vcf@gt))
vcf3 <- vcf[, !duplicated(colnames(vcf@gt))]

Finally, lets save this vcf.

# Persist the deduplicated vcf for downstream chunks
write.vcf(vcf3,"input_data/vcf3.vcf.gz")

Let’s look at the unfolded SFS (i.e. the MAF distribution)

# Minor allele frequencies per variant (vcfR::maf); column 4 is the frequency
mafs <- maf(vcf3)
ggplot()+geom_density(aes(x = mafs[,4]))

# Counts of common (>5%) and rare (<1%) variants
sum(mafs[,4]> 0.05)
sum(mafs[,4]< 0.01)

# Keep only variants with MAF above 1%
vcf3 <- vcf3[mafs[,4]>0.01,]

Of the 22k snps, about 9k are “rare” (< 1% MAF) and 5k are common (5% MAF)

Filtering history, run 1 raw_stacks_output - dayan.snps.vcf:28654 low coverage: dayan_snps_0.2.recode.vcf:22913 high_coverage: 0.3.recode.vcf:22829 paralog filtered: 1.0.recode.vcf:22603

run 2:

run 3:

Input Data

Here we read in genetic data

First for the Vaux SNP dataset

#remove duplicates: truncate sample names to 7 characters (strips re-run
#suffixes) and keep the first occurrence of each name
colnames(vcf_vaux@gt) <- str_sub(colnames(vcf_vaux@gt), start = 1, end = 7)
#duplicated(colnames(vcf_vaux@gt))
vcf_vaux <- vcf_vaux[, !duplicated(colnames(vcf_vaux@gt))]


genind_vaux <- vcfR2genind(vcf_vaux)

#encode sampling site: first two characters of each sample name
#exclude BA because so much missing data that after imputation for missing data (to prep for PCA) it strongly weighs down analysis (on extreme of most MEMs, but imputation makes it very close to mean allele frequency)
genind_vaux$pop <- as.factor(str_sub(row.names(genind_vaux$tab), 1, 2))
genind_vaux <- genind_vaux[pop(genind_vaux) != "BA"]

Now for the other data (not evaluated in final notebook) here are many version of the SNP dataset. For a first pass RDA we’re going to to use one without PSV or LD filtering.

#vcf_vaux <- read.vcfR("input_data/populations.snps.vcf") # 12 site
#vcf_vaux <- read.vcfR("input_data/vaux_data/ori-pac-1-p2-r95-a3.vcf") # 2 site

#vcf <- read.vcfR("input_data/dayan2.snps.vcf")


#filter SNPs to 80% inds (only tolerate up to 20% missing data at a locus)
#dp <- extract.gt(vcf, element = "DP", as.numeric=TRUE)
#myMiss <- apply(dp, MARGIN = 1, function(x){ sum( is.na(x) ) } )
#myMiss <- myMiss / ncol(dp)
#vcf <- vcf[myMiss < 0.2, ]

#vcf3 is the version with duplicates removed and dayan snp calling
#vcf <- read.vcfR("input_data/vcf3.vcf.gz")
genind <- vcfR2genind(vcf3)

#encode sampling site: first two characters of each sample name
genind$pop <- as.factor(str_sub(row.names(genind$tab), 1, 2))
#drop BA and BC -- only 2 and 3 individuals remain after filtering
genind <- genind[pop(genind) != "BA"]
genind <- genind[pop(genind) != "BC"]

#convert to genpop format 
#genpop <- genind2genpop(genind)

#North Pacific subset: drop the two southern sites (TS, NC)
genind_north <- genind[pop(genind) != "TS" & pop(genind) != "NC"]


#make fake duplicates to see how it will affect analysis
#genind_north_sample <- genind_north[sample(1:nrow(genind_north@tab), size = 20) ]

#genind_north <- repool(genind_north_sample, genind_north)

k-means and DAPC

As a first pass, we’ll attempt to cluster the data using k-means. Then, we visualize differences among genetic clusters using DAPC. This shouldn’t work well given k-means’ limited power with such subtle structure, but it’s low hanging fruit. Below we look for any populations that tend to cluster by k-means.

# k-means on 246 retained PCs for k = 2..5; each table cross-tabulates the
# inferred genetic clusters against the a priori sampling sites
kmeans <- find.clusters(genind_north, n.pca = 246, n.clust = 2)
kable(table(pop(genind_north), kmeans$grp), caption = "a priori population vs genetic cluster" )

kmeans <- find.clusters(genind_north, n.pca = 246, n.clust = 3)
kable(table(pop(genind_north), kmeans$grp), caption = "a priori population vs genetic cluster" )

kmeans <- find.clusters(genind_north, n.pca = 246, n.clust = 4)
kable(table(pop(genind_north), kmeans$grp), caption = "a priori population vs genetic cluster" )

kmeans <- find.clusters(genind_north, n.pca = 246, n.clust = 5)
kable(table(pop(genind_north), kmeans$grp), caption = "a priori population vs genetic cluster" )

Nope, it appears the k-means just continously splits clusters across sampling locations, without any clusters that correspond to clear spatial structures among sampling locations.

Spatial Analyses

Methods Summary

  1. cRDA: Determine if we can find a decomposition of spatial structure that significantly explains genetic structure in the dataset OTHER THAN the North/South spatial structure successfully identified in the Vaux manuscript. In this approach we will take extra care to avoid overfitting and fit the spatial variables on only the most important axes of genetic variation.

  2. RDA outlier detection: If the cRDA demonstrates that there are spatial structures that explain patterns of genetic variation, we will fit a second RDA on the full SNP dataset (not PCA transformed). Then we will use these results to identify which SNPs carry the strongest spatial structure signals. In other words, this is how we use spatially explicit multivariate analyses as a tool for feature selection.

Spatial Variables

Here we generate the spatial variables from the raw location data. We take two approaches. In the first, we use what is most commonly applied when working with population genetic data, distance based moran’s eigenvectors. In the second we fully explore a set of graphs (spatial neighborhoods) connecting sampling locations to build the spatial weighting matrix that best captures genetic variance. The latter may be more powerful if spatial structures other than IBD based autocorrelation drive structure.

We also generate the spatial variables for both the full dataset, and a north pacific only dataset.

*note: the sampling design meant that the spatial neighborhoods we built under different approaches always just recapitulated the dbMEMs, so we just used dbMEMs as these are the most commonly used MEMs in landscape/seascape genetics.

dbMEMs

The spatial variable is a distance based moran’s eigenvector map (dbMEM). dbMEM is used to decompose the spatial structure into a set of orthogonal eigenfunctions, thereby providing better resolution of autocorrelation or local structures.

# First let's get the rough lat and lons for the dataset
# just manually made a table from Vaux based on sampling map (no location information was provided in ms or supplemental materials)

sites <- read_tsv("input_data/rough_locations.txt")

#North Pacific subset: assumes rows 1-10 of rough_locations.txt are the
#NPAC sites -- TODO confirm against the file
Nsites <- sites[c(1:10),]

# expand site coordinates to one row per individual by joining on the
# two-letter site code carried in genind_vaux$pop
sites_big <- data.frame(pop = genind_vaux$pop)
sites_big <- sites_big %>%
  left_join(sites, by = c("pop" = "Pop"))
Nsites_big <- sites_big %>%
  filter(pop != "NC" & pop != "TS")

#first convert all lat lon to geodetic cartesian coords

#anti-meridian causes an issue (longitude flips sign at 180 degrees), so
#shift western-hemisphere longitudes by +360 to make the basin continuous
sites_big<- sites_big %>%
  mutate(Longitude = if_else(Longitude < 0, Longitude + 360, Longitude))
Nsites_big<- Nsites_big %>%
  mutate(Longitude = if_else(Longitude < 0, Longitude + 360, Longitude))

#now calc geodetic coords (unit = 1000 gives km)
require(SoDA)
## Warning: package 'SoDA' was built under R version 3.6.2
#assumes columns 3-4 of sites_big are Latitude, Longitude -- TODO confirm
XY_big <- geoXY(sites_big[,3], sites_big[,4], unit =1000)
NXY_big  <- geoXY(Nsites_big[,3], Nsites_big[,4], unit =1000)

#add jitter so individuals from the same site do not sit at identical
#coordinates (dbmem needs non-zero pairwise distances)
XY_big[,1] <- jitter(XY_big[,1])
XY_big[,2]<- jitter(XY_big[,2])

NXY_big[,1] <- jitter(NXY_big[,1])
NXY_big[,2]<- jitter(NXY_big[,2])


#now calculate dbMEMs (spatial autocorrelation); MEM.autocor = "all" keeps
#both positive and negative eigenvectors for inspection
dbmems <- dbmem(XY_big, MEM.autocor = "all")
Ndbmems <- dbmem(NXY_big, MEM.autocor = "all")

Now that dbmems are made let’s take a look at them.

# first eigenvalues/Moran's: positive eigenvalues correspond to positive
# spatial autocorrelation (the MEMs worth testing downstream)
barplot(attr(dbmems, "values"), 
        main = "Eigenvalues of the spatial weighting matrix\nProportional to Moran's I\nFull Dataset", cex.main = 0.7)

barplot(attr(Ndbmems, "values"), 
        main = "Eigenvalues of the spatial weighting matrix\nProportional to Moran's I\nNorth Pacific Only Dataset", cex.main = 0.7)

For the full dataset only 4 dbMEMs have a positive I (i.e. spatial autocorrelation), while the NPac has 2. Next let’s test these to see if they are worth keeping (i.e. if moran’s I is greater than 0 with a permutation test). Then we’ll plot them in space and see what we have.

#first recompute dbmems keeping only positive to speed up rand test (no need to test ones we're not going to use)
#(default MEM.autocor = "positive"; store.listw retains the spatial
#weighting matrix that moran.randtest needs)

dbmems <- dbmem(XY_big, store.listw = TRUE)
Ndbmems <- dbmem(NXY_big, store.listw = TRUE)

# permutation test of Moran's I for each MEM (99 permutations)
test <- moran.randtest(dbmems, nrepet = 99)
test
## class: krandtest lightkrandtest 
## Monte-Carlo tests
## Call: moran.randtest(x = dbmems, nrepet = 99)
## 
## Number of tests:   4 
## 
## Adjustment method for multiple comparisons:   none 
## Permutation number:   99 
##   Test         Obs    Std.Obs   Alter Pvalue
## 1 MEM1 0.815717205 175.163555 greater   0.01
## 2 MEM2 0.441406174  91.886176 greater   0.01
## 3 MEM3 0.108847059  30.131378 greater   0.01
## 4 MEM4 0.004807834   1.329097 greater   0.09
test <- moran.randtest(Ndbmems, nrepet = 99)
test
## class: krandtest lightkrandtest 
## Monte-Carlo tests
## Call: moran.randtest(x = Ndbmems, nrepet = 99)
## 
## Number of tests:   2 
## 
## Adjustment method for multiple comparisons:   none 
## Permutation number:   99 
##   Test        Obs    Std.Obs   Alter Pvalue
## 1 MEM1 0.52595991 241.537844 greater   0.01
## 2 MEM2 0.01055556   5.351842 greater   0.01

If we use an uncorrected p < 0.1 cutoff, the full dataset has 3 significant dbMEMs and the NPac has 2.

Now let’s plot these in space to see what they look like. First the set of MEMs for the North Pacific

# we'll use marmap to make some nice looking plots of the Pacific and color sampling locations by their dbMEM values

# North Pacific First
#first get the marmap data
require(marmap)
# NOAA bathymetry for the North Pacific; antimeridian = TRUE stitches the
# grid across 180 degrees
Npac_bathy <- getNOAA.bathy(lon1 = 130, lon2 = -116, lat1 = 11, lat2 = 53, resolution = 10, antimeridian = TRUE)

#now get the mem and sampling sites together
Nmem_plot <- as.data.frame(cbind(Ndbmems, Nsites_big))
#fix the antimeridian problem (shift western-hemisphere longitudes by +360)
Nmem_plot<- Nmem_plot %>%
  mutate(Longitude = if_else(Longitude < 0, Longitude + 360, Longitude))

#NPAC

#mem1: bathymetry raster with sampling sites coloured by their MEM1 score
autoplot(Npac_bathy, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2200)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(aes(x = Longitude, y = Latitude, color = MEM1), data = Nmem_plot, size = 3)+theme(legend.position = "none")+scale_color_viridis_c(option="A")+ggtitle("NPAC MEM1")

#mem2
autoplot(Npac_bathy, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2200)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(aes(x = Longitude, y = Latitude, color = MEM2), data = Nmem_plot, size = 3)+theme(legend.position = "none")+scale_color_viridis_c(option="A")+ggtitle("NPAC MEM2")

Next the set of MEMs for the Full Pacific.

# NOAA bathymetry for the full Pacific (both hemispheres)
pac_bathy <- getNOAA.bathy(lon1 = 130, lon2 = -116, lat1 = -50, lat2 = 53, resolution = 10, antimeridian = TRUE)

#now get the mem and sampling sites together
mem_plot <- as.data.frame(cbind(dbmems, sites_big))
#fix the antimeridian problem (shift western-hemisphere longitudes by +360)
mem_plot<- mem_plot %>%
  mutate(Longitude = if_else(Longitude < 0, Longitude + 360, Longitude))

#full Pacific: one map per MEM, sites coloured by their MEM score

#mem1
autoplot(pac_bathy, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2400)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(aes(x = Longitude, y = Latitude, color = MEM1), data = mem_plot, size = 3)+theme(legend.position = "none")+scale_color_viridis_c(option="A")+ggtitle("PAC MEM1")

#mem2
autoplot(pac_bathy, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2400)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(aes(x = Longitude, y = Latitude, color = MEM2), data = mem_plot, size = 3)+theme(legend.position = "none")+scale_color_viridis_c(option="A")+ggtitle("PAC MEM2")

#mem3
autoplot(pac_bathy, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2400)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(aes(x = Longitude, y = Latitude, color = MEM3), data = mem_plot, size = 3)+theme(legend.position = "none")+scale_color_viridis_c(option="A")+ggtitle("PAC MEM3")

#mem4
autoplot(pac_bathy, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2400)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(aes(x = Longitude, y = Latitude, color = MEM4), data = mem_plot, size = 3)+theme(legend.position = "none")+scale_color_viridis_c(option="A")+ggtitle("PAC MEM4")

cRDA (redundancy analysis of components)

Goals:
For RDAs, the primary goal is to determine if we can find a decomposition of spatial structure that significantly explains genetic structure in the dataset OTHER THAN the North/South structure successfully identified in the Vaux manuscript. If we identify structure, the variable loadings from the RDAs can also be used for initial feature selection.

Note on dataset: Here we run the first-pass, fastest possible analysis using Moran's eigenvector maps. We collapse genotypic data among individuals into principal components of genetic variation (thus cRDA). Note that larger analyses such as RDA on individual genotypes may have greater power and allow loadings on individual loci, but may be more prone to overfitting. If this faster, harder-to-overfit analysis discovers population substructure, then we can be encouraged to continue to these locus-level analyses.

Vaux full rda

the filtering approaches between the dataset used here and the vaux ms are pretty different, let’s run the RDA, but with the vaux dataset.

Genetic Data

First prep the data: impute missing data, conduct PCA

# --- Vaux dataset: impute missing data, run PCA, Kaiser-Guttman check ---

# Replace missing genotypes with the mean allele frequency (tab() returns
# allele frequencies per individual)
X <- tab(genind_vaux, freq = TRUE, NA.method = "mean")

# PCA on centred, unscaled allele frequencies; retain up to 324 axes
pca1 <- dudi.pca(X, scale = FALSE, scannf = FALSE, nf = 324)

# Kaiser-Guttman criterion: keep axes whose eigenvalue exceeds the mean
# eigenvalue (sum() of a logical is the idiomatic count)
cutoff <- mean(pca1$eig)
kg <- sum(pca1$eig > cutoff)
barplot(pca1$eig, main = "PCA eigenvalues")
abline(h = cutoff, col = "red")

# Decision: keep ALL PCs (the KG subset is retained as a comment for reference)
snp_pcs <- pca1$li  # [, seq_len(kg)]

# Plot individuals in PC space, coloured by population
pca_plot_data <- as.data.frame(cbind(genind_vaux$pop, snp_pcs))
pca_plot_data <- pca_plot_data %>%
  rename(pop = "genind_vaux$pop")
ggplot(data = pca_plot_data) +
  geom_point(aes(Axis1, Axis2, color = pop)) +
  stat_ellipse(aes(Axis1, Axis2, color = pop)) +
  theme_classic()

ggplot(data = pca_plot_data) +
  geom_point(aes(Axis1, Axis3, color = pop)) +
  stat_ellipse(aes(Axis1, Axis3, color = pop)) +
  theme_classic()

RDA

Now we fit the RDA and test for global significance

Full model is significant (p = 0.001). Proceed to variable selection. Here we conduct forward variable selection on an AIC-like value (ordistep), p-value and R2 (ordiR2step), and finally permutation of residuals (forward.sel).

## [1] 0.004046806
## Step: R2.adj= 0 
## Call: snp_pcs ~ 1 
##  
##                   R2.adjusted
## <All variables>  4.046806e-03
## + MEM1           2.861572e-03
## + MEM2           1.034443e-03
## + MEM3           1.298045e-04
## <none>           0.000000e+00
## + MEM4          -1.921325e-05
## 
##        Df    AIC      F Pr(>F)   
## + MEM1  1 2186.8 1.8695  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.002861572 
## Call: snp_pcs ~ MEM1 
##  
##                 R2.adjusted
## <All variables> 0.004046806
## + MEM2          0.003908958
## + MEM3          0.003001315
## <none>          0.002861572
## + MEM4          0.002851802
## 
##        Df    AIC      F Pr(>F)   
## + MEM2  1 2187.4 1.3176  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.003908958 
## Call: snp_pcs ~ MEM1 + MEM2 
##  
##                 R2.adjusted
## + MEM3          0.004052658
## <All variables> 0.004046806
## <none>          0.003908958
## + MEM4          0.003902647
## 
## Start: snp_pcs ~ 1 
## 
##        Df    AIC      F Pr(>F)   
## + MEM1  1 2186.8 1.8695  0.005 **
## + MEM2  1 2187.3 1.3138  0.005 **
## + MEM3  1 2187.6 1.0393  0.020 * 
## + MEM4  1 2187.6 0.9942  0.635   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: snp_pcs ~ MEM1 
## 
##        Df    AIC      F Pr(>F)   
## - MEM1  1 2186.6 1.8695  0.005 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        Df    AIC      F Pr(>F)   
## + MEM2  1 2187.4 1.3176  0.005 **
## + MEM3  1 2187.7 1.0423  0.010 **
## + MEM4  1 2187.8 0.9970  0.565   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: snp_pcs ~ MEM1 + MEM2 
## 
##        Df    AIC      F Pr(>F)   
## - MEM2  1 2186.8 1.3176  0.005 **
## - MEM1  1 2187.3 1.8715  0.005 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        Df    AIC      F Pr(>F)   
## + MEM3  1 2188.4 1.0434   0.01 **
## + MEM4  1 2188.4 0.9981   0.52   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: snp_pcs ~ MEM1 + MEM2 + MEM3 
## 
##        Df    AIC      F Pr(>F)   
## - MEM3  1 2187.4 1.0434  0.010 **
## - MEM2  1 2187.7 1.3177  0.005 **
## - MEM1  1 2188.3 1.8718  0.005 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        Df    AIC      F Pr(>F)
## + MEM4  1 2189.4 0.9982  0.615
## Testing variable 1
## Testing variable 2
## Testing variable 3
## Procedure stopped (adjR2thresh criteria) adjR2cum = 0.004053 with 3 variables (> 0.004047)
## MEM1 MEM2 MEM3 MEM4 
##    1    1    1    1

Here one method chooses all three mems, while the other two just keep mem1 and mem2. Not too concerned with overfitting for this analysis, so lets be less conservative and keep all three

# Final Vaux cRDA: genotype PCs constrained by the three selected dbMEMs
rda_final <- rda(snp_pcs ~ MEM1 + MEM2 + MEM3, data = dbmems)

# Which constrained axes explain more than their unconstrained counterparts?
# Holm-adjust the sequential permutation p-values.
rda_axis <- anova.cca(rda_final, by = "axis", parallel = 3)
rda_axis$p.adj <- p.adjust(rda_axis$`Pr(>F)`, method = "holm")
rda_axis

# Marginal tests: each MEM's contribution conditioned on all the others,
# plus its proportion of explained variance
rda_expl <- anova.cca(rda_final, by = "margin", parallel = 3)
rda_expl$p.adj <- p.adjust(rda_expl$`Pr(>F)`, method = "holm")
rda_expl$pve <- rda_expl$Variance / sum(rda_expl$Variance)

rda_expl

First two RDs and all MEMs significant adjusted p-value < 0.05.

Now lets plot these

## Tri-plot of the final Vaux cRDA

# Site (individual) scores on the first two redundant axes
rda_sum <- summary(rda_final, axes = c(2))
scores <- as.data.frame(rda_sum$sites)
scores$pop <- genind_vaux$pop

# Biplot arrows for the constraining MEMs
arrows <- as.data.frame(rda_sum$biplot)

# Individuals by population with MEM vectors overlaid.
# Fixed: the x-axis label previously read "0.7%%" (doubled percent sign).
ggplot() +
  geom_point(aes(RDA1, RDA2, color = pop), data = scores) +
  geom_segment(data = arrows, aes(x = 0, y = 0, xend = RDA1, yend = RDA2),
               arrow = arrow(length = unit(0.2, "cm")), color = "black") +
  geom_text(data = arrows,
            aes(x = RDA1 + RDA1 * 0.3, y = RDA2 + RDA2 * 0.3,
                label = c("MEM1", "MEM2", "MEM3")),
            size = 4, color = "black") +
  stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores) +
  theme_classic() +
  xlab("Redundant Axis 1\n~0.7% of Total Variance\n adjusted p-value 0.012") +
  ylab("Redundant Axis 2\n0.35% of Total Variance\n adjusted p-value 0.002") +
  scale_color_manual(values = c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c",
                                "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00",
                                "#cab2d6", "#6a3d9a", "#ffff99", "#b15928"))

Summary

We fit a global model of all variables. This full model was significant. We then performed forward variable selection. Three approaches yielded different results and we built the final model on MEMs 1, and 2, 3

Only the first two RDA axes were significant (i.e. explain significantly more than the equivalent unconstrained axes, Holms adjusted p value < 0.05). All explanatory variables (MEMs) in the final model are also significant (i.e. significantly greater explanation of variance for the variable, once the data is conditioned on all other variables, Holms adjusted p value < 0.05)

Collectively these suggest that there is a significant relationship between genetic variation and spatial autocorrelation (although note this model did not attempt to parse population structure from autocorrelation per se). The first two constrained axes captured ~ 1% of the total variance in the PCA.

When we examine the spatial structures in the RDA triplot we can see that MEMs 1 and 2 load strongly onto the first redundant axis, and this axis separates North from South Pacific populations. MEM3 loads mostly onto the second RDA and captures some spatial autocorrelation within the NPAC samples.

vaux SNP RDA

Here we run the SNP-based RDA to pull outlier SNPs for the two RDA axes we know are significant.

Here I take what I’ve learned from the cRDA to find spatial outliers SNPs that are candidates for a GTseq panel

First we collect the SNPs for spatial structure alone

The first step is fitting the RDAs and testing all the variables

# SNP-level RDA on the raw allele-frequency matrix X (not the PCs):
# null (intercept-only) vs full (all dbMEMs) models
snp_rda_null <- rda(X ~ 1, data = dbmems)
snp_rda_full <- rda(X ~ . , data = dbmems)


#check that the full model is significant
anova(snp_rda_full) # 0.002 yes it is significant - proceed to variable selection
#what the variance explained
adjR2.rda_full <- RsquareAdj(snp_rda_full)$adj.r.squared
adjR2.rda_full
## [1] 0.004046806
#variable selection
# here we do variable selection with three different forward selection approaches
# (1) ordiR2step: forward selection on adjusted R2 and permutation p-value
snp_ordiR2 <- ordiR2step(snp_rda_null, scope = formula(snp_rda_full, data = dbmems))
## Step: R2.adj= 0 
## Call: X ~ 1 
##  
##                   R2.adjusted
## <All variables>  4.046806e-03
## + MEM1           2.861572e-03
## + MEM2           1.034443e-03
## + MEM3           1.298045e-04
## <none>           0.000000e+00
## + MEM4          -1.921325e-05
## 
##        Df    AIC      F Pr(>F)   
## + MEM1  1 2186.8 1.8695  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.002861572 
## Call: X ~ MEM1 
##  
##                 R2.adjusted
## <All variables> 0.004046806
## + MEM2          0.003908958
## + MEM3          0.003001315
## <none>          0.002861572
## + MEM4          0.002851802
## 
##        Df    AIC      F Pr(>F)   
## + MEM2  1 2187.4 1.3176  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.003908958 
## Call: X ~ MEM1 + MEM2 
##  
##                 R2.adjusted
## + MEM3          0.004052658
## <All variables> 0.004046806
## <none>          0.003908958
## + MEM4          0.003902647
# Selection path retained by ordiR2step (adjusted-R2 / p-value criterion)
snp_ordiR2$anova  
# (2) ordistep: forward selection on an AIC-like criterion with permutation p-values
snpordi <- ordistep(snp_rda_null, scope = formula(snp_rda_full, data = dbmems)) 
## 
## Start: X ~ 1 
## 
##        Df    AIC      F Pr(>F)   
## + MEM1  1 2186.8 1.8695  0.005 **
## + MEM2  1 2187.3 1.3138  0.005 **
## + MEM3  1 2187.6 1.0393  0.010 **
## + MEM4  1 2187.6 0.9942  0.650   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: X ~ MEM1 
## 
##        Df    AIC      F Pr(>F)   
## - MEM1  1 2186.6 1.8695  0.005 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        Df    AIC      F Pr(>F)   
## + MEM2  1 2187.4 1.3176  0.005 **
## + MEM3  1 2187.7 1.0423  0.005 **
## + MEM4  1 2187.8 0.9970  0.480   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: X ~ MEM1 + MEM2 
## 
##        Df    AIC      F Pr(>F)   
## - MEM2  1 2186.8 1.3176  0.005 **
## - MEM1  1 2187.3 1.8715  0.005 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        Df    AIC      F Pr(>F)   
## + MEM3  1 2188.4 1.0434   0.01 **
## + MEM4  1 2188.4 0.9981   0.56   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: X ~ MEM1 + MEM2 + MEM3 
## 
##        Df    AIC      F Pr(>F)   
## - MEM3  1 2187.4 1.0434  0.010 **
## - MEM2  1 2187.7 1.3177  0.005 **
## - MEM1  1 2188.3 1.8718  0.005 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        Df    AIC      F Pr(>F)
## + MEM4  1 2189.4 0.9982   0.59
# Selection path retained by ordistep
snpordi$anova
# (3) forward.sel with an adjusted-R2 stopping rule (kept for reference)
#rda.fs <- forward.sel(Y = snp_pcs, X = dbmems, adjR2thresh = adjR2.rda_full)
#rda.fs 
# Final SNP-level model on the three selected MEMs
snp_rda_final <- rda(X ~ MEM1 + MEM2 + MEM3, data = dbmems)

#testing signficance of constrained axes (how many are interesting)
rda_axis <- anova.cca(snp_rda_final, by = 'axis', parallel = 3)
rda_axis$p.adj <- p.adjust (rda_axis$`Pr(>F)`, method = 'holm')
rda_axis
#testing significance of explanatory variables by margin (each conditioned on all the others)
rda_expl <-  anova.cca(snp_rda_final, by = 'margin', parallel = 3)
rda_expl$p.adj <- p.adjust (rda_expl$`Pr(>F)`, method = 'holm')
# proportion of explained variance attributable to each term
rda_expl$pve<- rda_expl$Variance/sum(rda_expl$Variance)

rda_expl
## tri plot
#site score
rda_sum<-summary(snp_rda_final, axes = c(2))
scores <- as.data.frame(rda_sum$sites)
scores$pop<-genind_vaux$pop


#species scores
# biplot arrows for the constraining MEMs
arrows<-as.data.frame(rda_sum$biplot)

#plot by pop
ggplot()+geom_point(aes(RDA1, RDA2, color = pop), data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="black")+geom_text(data=arrows, aes(x=RDA1+RDA1*0.3, y=RDA2+RDA2*0.3, label=c("MEM1", "MEM2", "MEM3")), size = 4,  color="black")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n1.5% of Total Variance")+ylab("Redundant Axis 2\n0.48% of Total Variance")+scale_color_discrete()#+scale_color_viridis_d(option = "A")

Now that the RDAs are fit let’s pull out the SNPs that load onto the RDs.

# SNP loadings ("species" scores) on the constrained axes
snp_loading <- as.data.frame(scores(snp_rda_final, display = "species"))

# Helper: histogram of z-scored loadings for one axis with +/-3 SD thresholds.
# The title reports the outlier count divided by 2 because each locus appears
# as two allele columns in X.
plot_loading_hist <- function(loading, title_prefix) {
  n_outliers <- sum(abs(scale(loading)) > 3) / 2
  ggplot() +
    geom_histogram(aes(x = scale(loading))) +
    theme_classic() +
    ggtitle(paste(title_prefix, "\n", n_outliers, sep = "")) +
    xlab("Z-scores of SNP loadings") +
    geom_vline(aes(xintercept = 3), color = "red") +
    geom_vline(aes(xintercept = -3), color = "red")
}

a <- plot_loading_hist(snp_loading$RDA1, "Full Pacific RDA 1")
b <- plot_loading_hist(snp_loading$RDA2, "Full Pacific RDA 2")

cowplot::plot_grid(a, b)

# Count the outlier loci per axis (two allele rows per locus, hence / 2)
sum(abs(scale(snp_loading$RDA1)) > 3) / 2
sum(abs(scale(snp_loading$RDA2)) > 3) / 2

# Helper: pull |z| > 3 outlier rows for one axis and keep a single SNP per
# catalog locus (row names look like catalog_id:pos:strand).
extract_outliers <- function(loading_col) {
  snp_loading[abs(scale(loading_col)) > 3, ] %>%
    rownames_to_column(var = "SNP") %>%
    separate(SNP, sep = ":", into = c("catalog_id", "catalog_snp_pos", "strand")) %>%
    group_by(catalog_id) %>%
    slice(1)
}

axis2_snps <- extract_outliers(snp_loading$RDA2)
axis1_snps <- extract_outliers(snp_loading$RDA1)

#write_tsv(axis2_snps, "panel_info/vaux_snp_rda_outliers_axis2.txt")
#write_tsv(axis1_snps, "panel_info/vaux_snp_rda_outliers_axis1.txt")

Are SNPs that fall into the tails of these distributions rare or common?

# Loci loading strongly (|z| > 3) onto RDA1, the N/S axis
# (dropped the redundant which(x, ) wrapper around the logical mask)
n_s_snps <- row.names(snp_loading[abs(scale(snp_loading$RDA1)) > 3, ])
# Strip the 2-character strand suffix (e.g. ":+") to match genind locus names
n_s_snps <- str_sub(n_s_snps, 1, nchar(n_s_snps) - 2)
n_s_snps <- genind_vaux[loc = n_s_snps]

# Minor allele frequencies of outlier loci vs the whole panel
mafs_n_s <- minorAllele(n_s_snps)
mafs_all <- minorAllele(genind_vaux)

# Build a long data frame of MAFs labelled by set.
# Fixed: the previous cbind()/as.data.frame() route made `maf` a factor on
# R < 4.0 but a character on R >= 4.0, where as.numeric(levels(f))[f] fails
# (levels() of a character is NULL). as.numeric(as.character(.)) is robust
# to factor, character, and numeric inputs alike.
x <- data.frame(maf = as.numeric(as.character(mafs_n_s)),
                set = rep("n_s", length(mafs_n_s)))
y <- data.frame(maf = as.numeric(as.character(mafs_all)),
                set = rep("all", length(mafs_all)))

maf_comp <- as.data.frame(rbind(x, y))

# Density of MAFs: outlier set vs all loci
ggplot(data = maf_comp) + geom_density(aes(x = maf, color = set)) + xlim(0, 0.5)

Common alleles are vastly over-represented among those that load strongly onto the axis separating N from S pacific samples.

What about axis 2 SNPs?

# Same MAF comparison, but for loci loading strongly onto RDA2
n_s_snps <- row.names(snp_loading[abs(scale(snp_loading$RDA2)) > 3, ])
# Strip the 2-character strand suffix to match genind locus names
n_s_snps <- str_sub(n_s_snps, 1, nchar(n_s_snps) - 2)
n_s_snps <- genind_vaux[loc = n_s_snps]

# Minor allele frequencies of outlier loci vs the whole panel
mafs_n_s <- minorAllele(n_s_snps)
mafs_all <- minorAllele(genind_vaux)

# Fixed: replaced the fragile as.numeric(levels(f))[f] idiom, which breaks
# when the column is character (the default from cbind() on R >= 4.0), with
# as.numeric(as.character(.)), which handles factor/character/numeric alike.
x <- data.frame(maf = as.numeric(as.character(mafs_n_s)),
                set = rep("n_s", length(mafs_n_s)))
y <- data.frame(maf = as.numeric(as.character(mafs_all)),
                set = rep("all", length(mafs_all)))

maf_comp <- as.data.frame(rbind(x, y))

# Density of MAFs: outlier set vs all loci
ggplot(data = maf_comp) + geom_density(aes(x = maf, color = set)) + xlim(0, 0.5)

Same. This suggests either this inference method is biased towards common SNPs (makes sense) or structure at this scale lies more with common than rare SNPs. In either case it suggests that "neutral" SNPs might be most useful if they are biased towards common SNPs rather than evenly representing the SFS.

Genetic Data

To fit the cRDA we need to get the PCs of genetic variation for each individual and the dbmems into a dataframe

First the PCs for the full dataset

# --- Full dataset: impute missing data, run PCA, Kaiser-Guttman check ---

# Replace missing genotypes with the mean allele frequency
X <- tab(genind, freq = TRUE, NA.method = "mean")

# PCA on centred, unscaled allele frequencies; retain up to 324 axes
pca1 <- dudi.pca(X, scale = FALSE, scannf = FALSE, nf = 324)

# Kaiser-Guttman criterion: count axes above the mean eigenvalue
# (sum() of a logical replaces the old length(x[x > cutoff]) construction)
cutoff <- mean(pca1$eig)
kg <- sum(pca1$eig > cutoff)
barplot(pca1$eig, main = "PCA eigenvalues")
abline(h = cutoff, col = "red")

# Decision: keep ALL PCs (KG subset retained as a comment for reference)
snp_pcs <- pca1$li  # [, seq_len(kg)]

Using the kaiser-guttman criteria, we kept the first 129 PCs in the full SNP dataset.

Now Let’s take a look at these populations in PC space.

# Visualise individuals in PC space, coloured by population
pca_plot_data <- cbind(genind$pop, snp_pcs) %>%
  as.data.frame() %>%
  rename(pop = "genind$pop")

ggplot(data = pca_plot_data) +
  geom_point(aes(Axis1, Axis2, color = pop)) +
  stat_ellipse(aes(Axis1, Axis2, color = pop)) +
  theme_classic()

ggplot(data = pca_plot_data) +
  geom_point(aes(Axis1, Axis3, color = pop)) +
  stat_ellipse(aes(Axis1, Axis3, color = pop)) +
  theme_classic()

No surprises here, same results as the Vaux ms. Fst is too low for PCA to discriminate among pops within a gyre, but it does cluster S Pac samples apart from N Pac, and there are some samples with admixture between these two clusters. The only difference is that there are some strong outliers among the NC samples; potentially these are due to some of the filtering parameters being different. (check later)

Now lets do the same, just for the north pacific samples

##### North Pacific samples only

# Replace missing genotypes with the mean allele frequency
Xn <- tab(genind_north, freq = TRUE, NA.method = "mean")

# PCA on centred, unscaled allele frequencies; retain up to 245 axes
pca_north <- dudi.pca(Xn, scale = FALSE, scannf = FALSE, nf = 245)

# Kaiser-Guttman criterion: count axes above the mean eigenvalue
# (sum() of a logical replaces length(x[x > cutoff]))
cutoff <- mean(pca_north$eig)
kg <- sum(pca_north$eig > cutoff)
barplot(pca_north$eig, main = "PCA eigenvalues")
abline(h = cutoff, col = "red")

# 117 PCs exceed the Kaiser-Guttman cutoff, but keep them all
# Nsnp_pcs <- pca_north$li[, seq_len(kg)]
Nsnp_pcs <- pca_north$li

# Plot North Pacific individuals in PC space
Npca_plot_data <- as.data.frame(cbind(genind_north$pop, Nsnp_pcs))
Npca_plot_data <- Npca_plot_data %>%
  rename(pop = "genind_north$pop")
ggplot(data = Npca_plot_data) +
  geom_point(aes(Axis1, Axis2, color = pop)) +
  stat_ellipse(aes(Axis1, Axis2, color = pop)) +
  theme_classic()

ggplot(data = Npca_plot_data) +
  geom_point(aes(Axis3, Axis4, color = pop)) +
  stat_ellipse(aes(Axis3, Axis4, color = pop)) +
  theme_classic()

Again, no surprises. No evidence of structure within North Pacific when using PCA.

Full Dataset cRDA

Now that we have gathered all of our variables, we’ll fit the cRDA using the full dataset

Notes:
Had trouble understanding why observation of structure in the North Pacific was so highly dependent on the SNP dataset used, so I ran the analysis on many separate datasets until I realized that positive results depended on there being near duplicates in PC space. It looks like Vaux used some sequencing replicates that were filtered out in the final manuscript but not in the datasets I used. To confirm that this is how I was observing structure, I ran stacks and did my own filtering, removed potential replicates, and then did two more runs for the North Pacific: one with the true dataset, a second with 30 duplicated individuals. I found no significant structure in the N Pacific (i.e. the p-value of the permutation test of the full RDA model using Kaiser-Guttman cutoff PCs was not significant), but adding duplicate samples led to a highly significant test and observation of structure at whatever spatial scale I tested.

when we use the deduplicated dataset on the full pacific dataset using my filtering, the full rda is signficant, but the axis that captures variation among N pacific sampling sites is not significant.

first reset the spatial variables for this dataset

# Rough site coordinates digitised from the Vaux sampling map (no location
# information was provided in the manuscript or supplemental materials)
sites <- read_tsv("input_data/rough_locations.txt")

# The first ten rows are the North Pacific sites
Nsites <- sites[1:10, ]

# Expand to one row per individual by joining site info on population
sites_big <- data.frame(pop = genind$pop) %>%
  left_join(sites, by = c("pop" = "Pop"))

# North-only individuals: drop the two southern populations
Nsites_big <- sites_big %>%
  filter(pop != "NC" & pop != "TS")

# Convert lat/lon to geodetic cartesian coordinates (km) for dbMEM construction

# The antimeridian causes a discontinuity: shift western-hemisphere
# longitudes to 0-360 first
sites_big <- sites_big %>%
  mutate(Longitude = if_else(Longitude < 0, Longitude + 360, Longitude))
Nsites_big <- Nsites_big %>%
  mutate(Longitude = if_else(Longitude < 0, Longitude + 360, Longitude))

# library() instead of require(): a missing package should fail loudly here,
# not return FALSE and crash later at geoXY()
library(SoDA)
# Columns 3 and 4 are assumed to be latitude and longitude in geoXY()'s
# expected order -- TODO confirm against rough_locations.txt column order
XY_big <- geoXY(sites_big[, 3], sites_big[, 4], unit = 1000)
NXY_big <- geoXY(Nsites_big[, 3], Nsites_big[, 4], unit = 1000)

# Jitter coincident per-site coordinates so dbmem() sees distinct points.
# NOTE(review): jitter() is random and unseeded, so the MEMs are not exactly
# reproducible between runs -- consider set.seed() before this chunk.
XY_big[, 1] <- jitter(XY_big[, 1])
XY_big[, 2] <- jitter(XY_big[, 2])

NXY_big[, 1] <- jitter(NXY_big[, 1])
NXY_big[, 2] <- jitter(NXY_big[, 2])

# Distance-based Moran's eigenvector maps; keep all eigenvectors (positive
# and negative autocorrelation) for the initial inspection
dbmems <- dbmem(XY_big, MEM.autocor = "all")
Ndbmems <- dbmem(NXY_big, MEM.autocor = "all")

Now that dbmems are made let’s take a look at them.

# Eigenvalues of the spatial weighting matrices (proportional to Moran's I)
barplot(
  attr(dbmems, "values"),
  main = "Eigenvalues of the spatial weighting matrix\nProportional to Moran's I\nFull Dataset",
  cex.main = 0.7
)

barplot(
  attr(Ndbmems, "values"),
  main = "Eigenvalues of the spatial weighting matrix\nProportional to Moran's I\nNorth Pacific Only Dataset",
  cex.main = 0.7
)

For the full dataset only 4 dbMEMs have a positive I (i.e. spatial autocorrelation), while the NPac has 2. Next let’s test these to see if they are worth keeping (i.e. if moran’s I is greater than 0 with a permutation test). Then we’ll plot them in space and see what we have.

# Recompute dbMEMs keeping only the positively autocorrelated eigenvectors
# (the dbmem() default) -- no point permutation-testing MEMs we won't use.
# store.listw = TRUE retains the spatial weights needed by moran.randtest().
dbmems <- dbmem(XY_big, store.listw = TRUE)
Ndbmems <- dbmem(NXY_big, store.listw = TRUE)

# Permutation tests (99 permutations) that each MEM's Moran's I exceeds zero
full_moran <- moran.randtest(dbmems, nrepet = 99)
full_moran

north_moran <- moran.randtest(Ndbmems, nrepet = 99)
north_moran

Global Test and Variable Selection

All variable selection procedures agree on keeping MEM1, forward selection based on p-value along (ordistep) keeps MEM2 as well

Final Model

Now that we have established that the full model is significant and done variable selection to identify the spatial structures, let's build the final model.

# Final full-Pacific cRDA: genotype PCs constrained by the selected dbMEMs
rda_final <- rda(snp_pcs ~ MEM1 + MEM2, data = dbmems)

# Axis tests: do the constrained axes explain more than their unconstrained
# counterparts? Holm-adjust the sequential permutation p-values.
rda_axis <- anova.cca(rda_final, by = "axis", parallel = 3)
rda_axis$p.adj <- p.adjust(rda_axis$`Pr(>F)`, method = "holm")
rda_axis

# Marginal tests: each MEM conditioned on the other, plus its share of the
# explained variance
rda_expl <- anova.cca(rda_final, by = "margin", parallel = 3)
rda_expl$p.adj <- p.adjust(rda_expl$`Pr(>F)`, method = "holm")
rda_expl$pve <- rda_expl$Variance / sum(rda_expl$Variance)

rda_expl

Permutation analyses suggest that only the first redundant axis significantly correlates patterns in the spatial dataset with patterns in the genetic dataset. Separate permutations also show that the selected MEMs significantly explain the PCs.

## Screeplot: percent variance per axis, constrained (red) vs unconstrained (black)
constrained_percent <- 100 * rda_final$CCA$eig / rda_final$tot.chi
unconstrained_percent <- 100 * rda_final$CA$eig / rda_final$tot.chi
barplot(
  c(constrained_percent, unconstrained_percent),
  col = c(rep("red", length(constrained_percent)),
          rep("black", length(unconstrained_percent))),
  las = 2,
  ylab = "% variation"
)
Screeplot of RDA - red is constrained axes, black is unconstrained axes

Screeplot of RDA - red is constrained axes, black is unconstrained axes

Full Pacific cRDA Results Summary

## Tri-plot of the final full-Pacific cRDA

# Site (individual) scores on the first two redundant axes
rda_sum <- summary(rda_final, axes = c(2))
scores <- as.data.frame(rda_sum$sites)
scores$pop <- genind$pop

# Biplot arrows for the constraining MEMs
arrows <- as.data.frame(rda_sum$biplot)

# Individuals by population with MEM vectors overlaid
ggplot() +
  geom_point(aes(RDA1, RDA2, color = pop), data = scores) +
  geom_segment(data = arrows, aes(x = 0, y = 0, xend = RDA1, yend = RDA2),
               arrow = arrow(length = unit(0.2, "cm")), color = "black") +
  geom_text(data = arrows,
            aes(x = RDA1 + RDA1 * 0.3, y = RDA2 + RDA2 * 0.3,
                label = c("MEM1", "MEM2")),
            size = 4, color = "black") +
  stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores) +
  theme_classic() +
  xlab("Redundant Axis 1\n0.86% of Total Variance\np-value 0.001") +
  ylab("Redundant Axis 2\n0.38% of Total Variance\np-value 0.17") +
  scale_color_manual(values = c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c",
                                "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00",
                                "#cab2d6", "#6a3d9a", "#ffff99", "#b15928"))

We fit a global model of all variables. This full model was significant. We then performed forward variable selection. Three approaches yielded the same result and we built the final model on MEMs 1, and 2

Only the first RDA axis is significant (i.e. explain significantly more than the equivalent unconstrained axes, Holms adjusted p value < 0.05). All explanatory variables (MEMs) in the final model are also significant (i.e. significantly greater explanation of variance for the variable, once the data is conditioned on all other variables, Holms adjusted p value < 0.05)

Collectively these suggest that there is a significant relationship between genetic variation and spatial autocorrelation (although note this model did not attempt to parse population structure from autocorrelation). The first two constrained axes captured ~ 1.8% of the total variance in the model. Since only the first XXX PCs of genetic variation were included, we can not directly relate this back to FST, but a back of the napkin calculation would suggest we fit about half to total variance in the dataset. This would bring these results in close alignment with the Fst attributed to North vs South population genetic structure.

When we examine the spatial structures in the RDA triplot we can see that while MEMs 1 and 2 load strongly onto the first redundant axis, and this axis separates North from South pacific populations. This is not surprising given the results from the Vaux MS.

XXX remnant from keeping mem4 XXX However, the second redundant axis provides very interesting results that suggest significant spatial structures that “are redundant with” genetic variance within the North Pacific. Specifically redundant axis 2 is driven almost exclusively by MEM4 and this MEM (while distance based and mostly separating Hawaii from all other samples), also maps strongly onto a N-S gradient within the west coast N Pac samples. Indeed when we examine the tri-plot we can observe clinal variation along the West Coast samples. This pattern (RDA2 separates N-S pops and Hawaii), may be due to (a) introgression tails from the southern hemisphere (as suggested by Vaux ms), (b) spatial autocorrelation produced by IBD within the North Pacific, or (c) an adaptive genetic cline. Parsing (c) from (a and b) may be possible with an environmental association analysis, but parsing (a) and (b) may be challenging without haplotypic data. Conducting the analysis on the North Pac only samples (below), may also clarify these patterns

North Pacific cRDA

Global Test and Variable Selection

# Global North Pacific cRDA: null (intercept-only) vs full (all North dbMEMs)
Nrda_null <- rda(Nsnp_pcs ~ 1, data = Ndbmems)
Nrda_full <- rda(Nsnp_pcs ~ ., data = Ndbmems)

# Permutation test of the full model (p = 0.006: significant, so proceed
# to variable selection)
anova(Nrda_full)

# Adjusted R^2 of the full model (also the stopping threshold below)
NadjR2.rda_full <- RsquareAdj(Nrda_full)$adj.r.squared
NadjR2.rda_full

# Forward variable selection three ways:
# (1) adjusted-R2 + p-value (ordiR2step)
NordiR2 <- ordiR2step(Nrda_null, scope = formula(Nrda_full, data = Ndbmems))
NordiR2$anova
# (2) AIC-like criterion + p-value (ordistep)
Nordi <- ordistep(Nrda_null, scope = formula(Nrda_full, data = Ndbmems))
Nordi$anova
# (3) permutation of residuals with an adjusted-R2 stop rule (forward.sel)
Nrda.fs <- forward.sel(Y = Nsnp_pcs, X = Ndbmems, adjR2thresh = NadjR2.rda_full)
Nrda.fs

Global is significant, variable selection keeps both MEMs

Final Model

Now that we have established that the full model is significant and done variable selection to identify the spatial structures, let's build the final model.

# Final North Pacific model: only MEM2 retained
Nrda_final <- rda(Nsnp_pcs ~ MEM2, data = Ndbmems)

# Axis tests with Holm-adjusted permutation p-values
rda_axis <- anova.cca(Nrda_final, by = "axis", parallel = 3)
rda_axis$p.adj <- p.adjust(rda_axis$`Pr(>F)`, method = "holm")
rda_axis

# Marginal test of the single explanatory MEM, plus its share of
# explained variance
rda_expl <- anova.cca(Nrda_final, by = "margin", parallel = 3)
rda_expl$p.adj <- p.adjust(rda_expl$`Pr(>F)`, method = "holm")
rda_expl$pve <- rda_expl$Variance / sum(rda_expl$Variance)

rda_expl

Keep 2 RDAs; all MEMs

North Pacific cRDA Results Summary

## tri plot
#site score
# Site (individual) scores from the final North Pacific model
Nrda_sum<-summary(Nrda_final, axes = c(2))
Nscores <- as.data.frame(Nrda_sum$sites)
Nscores$pop<-genind_north$pop


#species scores
# biplot arrow for the single constraining MEM
Narrows<-as.data.frame(Nrda_sum$biplot)

#plot by pop
# earlier two-MEM version kept for reference:
#ggplot()+geom_point(aes(RDA1, RDA2, color = pop), data = Nscores)+geom_segment(data=Narrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="black")+geom_text(data=Narrows, aes(x=RDA1+RDA1*0.3, y=RDA2+RDA2*0.3, label=c("MEM1", "MEM2")), size = 4,  color="black")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = Nscores)+theme_classic()+xlab("Redundant Axis 1\n0.48% of Total Variance\np-value=0.18")+ylab("Redundant Axis 2\n0.44% of Total Variance\np-value=0.54")+scale_color_discrete()+ggtitle("NPAC RDA\nglobal p-value 0.31")#+scale_color_viridis_d(option = "A")

# with only one constrained axis, plot RDA1 against the first unconstrained PC
ggplot()+geom_point(aes(RDA1, PC1, color = pop), data = Nscores)+geom_segment(data=Narrows, aes(x=0, y=0, xend=RDA1, yend=PC1), arrow=arrow(length=unit(0.2,"cm")), color="black")+geom_text(data=Narrows, aes(x=RDA1+RDA1*0.3, y=PC1+PC1*0.3, label=c("MEM2")), size = 4,  color="black")+stat_ellipse(aes(RDA1, PC1, color = pop), data = Nscores)+theme_classic()+xlab("Redundant Axis 1\n0.48% of Total Variance\np-value=0.2")+ylab("PC axis 1")+scale_color_discrete()+ggtitle("NPAC RDA\nglobal p-value 0.19")#+scale_color_viridis_d(option = "A")

We fit a global model of all variables. This full model was significant. We then performed forward variable selection. Three approaches yielded the same result and we built the final model on MEMs 1 and 2.

Only the first 2 RDA axes are significant (i.e. explain significantly more than the equivalent unconstrained axes, Holms adjusted p value < 0.05). All explanatory variables (MEMs) in the final model are also significant (i.e. significantly greater explanation of variance for the variable, once the data is conditioned on all other variables, Holms adjusted p value < 0.05)

Collectively these suggest that there is a significant relationship between genetic variation and spatial autocorrelation at the tested scales. The first two constrained axes captured ~ 0.9% of the total variance in the model. Since only the first 117 PCs of genetic variation were included, we can not directly relate this back to FST.

When we examine the RDA triplot we can see that MEMs 1 and 2 load strongly onto different redundant axes. Three clusters of populations are apparent: eastern (West coast N America), central (Hawaii), and western (Japan + Philippines). RDA1 (and MEM2) separates all three clusters of populations with the western cluster intermediate, while RDA2 (and MEM1) clusters the eastern and central clusters together and separates the western cluster. Taken together, it is possible to explain a small portion of the genetic variation in the north Pacific as a result of spatial autocorrelation among these three clusters, although there may be other demographic and selective scenarios that can produce similar patterns (see discussion below)

cRDA Discussion

Both cRDAs show that there are spatial structures at a variety of scales that can explain variation in the genetic data within the Pacific. However, synthesizing the results from the North Pacific dataset and the full dataset raises some questions.

(1) Is there a N-S cline within the N Pac samples, if so why don’t we see it in the NPac cRDA?
Since our MEMs are dbMEMs, they are a spectral decomposition of the distances between populations. No dbMEMs in the North Pacific dataset captured this N-S gradient. I’m not sure why the finest scale spatial autocorrelation was not captured by a MEM with positive Moran’s I. Perhaps using a different spatial neighborhood to generate MEMs would capture this pattern, or we could just test for it directly by isolating these samples, or fitting for latitude.

In any case, the absence of the pattern in the North Pac cRDA is most easily explained by the fact that we did not test for it. An explicit test of latitude as an explanation for genetic variation is possible, and we can even parse this from spatial autocorrelation per se with yet another RDA or other canonical ordination approach.

(2) Is this overfit?

Maybe.

A lesson from population genomics on marine organisms is that we have so much data that we tend to find patterns whenever we look for them. While a lot of authors attribute this to efficiency of selection in large populations coupled with abundant environmental gradients, overfitting is also a concern. Several lines of evidence suggest these results should be robust to overfitting. (1) Only kept principal components of genetic variation that exceeded a criterion, didn’t fit on the full dataset and (2) the analysis should be resistant to overfitting in the traditional sense because n (samples) >> p (predictors: MEMs) and we performed variable selection on the tiny number of predictors we did use. However, there’s enough variation in the response variables (SNP PCA) that there is a possibility that the TRUE patterns in the dataset we are modeling are not true in nature (i.e. our procedures for avoiding overfitting are all focused on the explanatory variable side). Only validation with new data can confirm. Alternatively, we could attempt to conduct some type of cross-validation approach. This might be worthwhile before selecting SNPs.

(3) N-S Admixture Cline, N PAC E-W Clusters, Clustered Clines?

The biggest challenge going forward is that the RDAs have demonstrated that there are genetic patterns at a diversity of spatial scales and patterns. Therefore there is no clear choice for clustering that will allow us to group sampling sites into sub-populations for later genetic stock identification. This is a common problem in seascape genetics: we are trying to cluster a cline.

We might be able to parse the N-S cline. However, the most powerful MEMs are always the ones that are capturing N-S structure. Therefore it seems likely that what’s actually going on here is that latitude is a better predictor of genetics than spatial autocorrelation. We can test this by explicitly including latitude and then doing variance partitioning / pRDA. Because the SWM seems to mostly capture E-W distance, this might actually have some power.

Latitude

# predictor table for the full dataset: dbMEMs plus absolute latitude
lat_data <- cbind(dbmems, abs(sites_big$Latitude))
# assumes dbmems has 4 columns so latitude lands in column 5 — TODO confirm
colnames(lat_data)[5] <- "lat"


lat_rda_null <- rda(snp_pcs ~ 1, data = lat_data)
lat_rda_full <- rda(snp_pcs ~ ., data = lat_data)

# forward selection from the null model
lat_ordiR2 <- ordiR2step(lat_rda_null, scope = formula(lat_rda_full, data = lat_data))
lat_ordiR2$anova  

# latitude not included in variable selection



# final model: selected MEMs plus latitude (forced in despite selection result)
lat_rda_final <- rda(snp_pcs ~ MEM1 + MEM2 + lat, data = lat_data)

# variance inflation check (latitude correlates with some MEMs)
vif.cca(lat_rda_final)


#testing significance of constrained axes (how many are interesting)
rda_axis <- anova.cca(lat_rda_final, by = 'axis', parallel = 3)
rda_axis$p.adj <- p.adjust (rda_axis$`Pr(>F)`, method = 'holm')
rda_axis

#testing significance of explanatory variables by margin (each conditioned on all the others)
rda_expl <-  anova.cca(lat_rda_final, by = 'margin', parallel = 3)
rda_expl$p.adj <- p.adjust (rda_expl$`Pr(>F)`, method = 'holm')
rda_expl$pve<- rda_expl$Variance/sum(rda_expl$Variance)

rda_expl

# lat was not retained by selection, but we still partition variance between
# the spatial (MEM) fraction and latitude, and test lat conditioned on the MEMs

vp <- varpart(snp_pcs , lat_data[,c(1,2)] , lat_data[,5])
vp

# unique effect of latitude after removing spatial autocorrelation
rda_con_spatial<-rda(snp_pcs ~  lat + Condition(MEM1 + MEM2), data = lat_data)
anova(rda_con_spatial)

# site scores on the first two plotted axes of the latitude+MEM model
rda_sum<-summary(lat_rda_final, axes = c(2))
scores <- as.data.frame(rda_sum$sites)
scores$pop<-genind$pop
scores$lat <- lat_data$lat


#species scores (biplot arrows for the explanatory variables)
arrows<-as.data.frame(rda_sum$biplot)

#plot by lat
#ggplot()+geom_point(aes(RDA1, RDA2, color = lat), data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="black")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n1.5% of Total Variance")+ylab("Redundant Axis 2\n0.48% of Total Variance")+scale_color_viridis_c()


#plot by pop
ggplot()+geom_point(aes(RDA1, RDA2, color = pop), data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="black")+geom_text(data=arrows, aes(x=RDA1+RDA1*0.3, y=RDA2+RDA2*0.3, label=c("MEM1", "MEM2", "Latitude")), size = 4,  color="black")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n1.5% of Total Variance")+ylab("Redundant Axis 2\n0.48% of Total Variance")+scale_color_discrete()#+scale_color_viridis_d(option = "A")
# North Pacific predictor table: dbMEMs plus absolute latitude
Nlat_data <- cbind(Ndbmems, abs(Nsites_big$Latitude))
# latitude is appended after the Ndbmems columns; index 3 assumes 2 MEMs — confirm
colnames(Nlat_data)[3] <- "lat"


Nlat_rda_null <- rda(Nsnp_pcs ~ 1, data = Nlat_data)
Nlat_rda_full <- rda(Nsnp_pcs ~ ., data = Nlat_data)

#global test of the full model
anova(Nlat_rda_full)

# forward selection. BUG FIX: Pin (the inclusion p-value threshold) is an
# argument of ordiR2step(), not formula(); it was previously passed to
# formula() where it was silently ignored.
Nlat_ordiR2 <- ordiR2step(Nlat_rda_null, scope = formula(Nlat_rda_full, data = Nlat_data), Pin = 0.5)
Nlat_ordiR2$anova  

# final model with both MEMs and latitude
Nlat_rda_final <- rda(Nsnp_pcs ~ MEM1 + MEM2 + lat, data = Nlat_data)

# variance inflation check (latitude correlates with the MEMs)
vif.cca(Nlat_rda_final)

#testing significance of explanatory variables by margin (each conditioned on all the others)
rda_expl <-  anova.cca(Nlat_rda_final, by = 'margin', parallel = 3)
rda_expl$p.adj <- p.adjust (rda_expl$`Pr(>F)`, method = 'holm')
# per-term share of the tabulated variance
rda_expl$pve<- rda_expl$Variance/sum(rda_expl$Variance)

rda_expl

#testing signficance of constrained axes (how many are interesting)
rda_axis <- anova.cca(Nlat_rda_final, by = 'axis', parallel = 3)
rda_axis$p.adj <- p.adjust (rda_axis$`Pr(>F)`, method = 'holm')
rda_axis


# partition variance between the first MEM (col 1) and latitude (col 3)
vp <- varpart(Nsnp_pcs , Nlat_data[,c(1)] , Nlat_data[,c(3)])
vp

# unique effect of latitude after conditioning on spatial autocorrelation
rda_con_spatial<-rda(Nsnp_pcs ~  lat + Condition(MEM1 + MEM2), data = Nlat_data)
anova(rda_con_spatial)


# site scores and plots for the North Pacific latitude+MEM model
rda_sum<-summary(Nlat_rda_final, axes = c(2))
scores <- as.data.frame(rda_sum$sites)
scores$pop<-genind_north$pop
scores$lat <- Nlat_data$lat


#species scores
arrows<-as.data.frame(rda_sum$biplot)

#plot by lat
ggplot()+geom_point(aes(RDA1, RDA2, color = lat), data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="black")+geom_text(data=arrows, aes(x=RDA1+RDA1*0.3, y=RDA2+RDA2*0.3, label=c("MEM1", "MEM2", "Latitude")), size = 4,  color="black")+theme_classic()+xlab("Redundant Axis 1\n1.5% of Total Variance")+ylab("Redundant Axis 2\n0.48% of Total Variance")+scale_color_viridis_c()



#plot by pop
ggplot()+geom_point(aes(RDA1, RDA2, color = pop), data = scores, alpha = 0.8)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="black")+geom_text(data=arrows, aes(x=RDA1+RDA1*0.3, y=RDA2+RDA2*0.3, label=c("MEM1", "MEM2", "Latitude")), size = 4,  color="black")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n0.48% of Total Variance\npvalue = 0.139 ")+ylab("Redundant Axis 2\n0.46% of Total Variance\np-value = 0.52")#+scale_color_manual(values = c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a", "#ffff99", "#b15928"))
# rebuild the North Pacific predictor table (duplicates the chunk above)
Nlat_data <- cbind(Ndbmems, abs(Nsites_big$Latitude))
colnames(Nlat_data)[3] <- "lat"


Nlat_rda_null <- rda(Nsnp_pcs ~ 1, data = Nlat_data)
Nlat_rda_full <- rda(Nsnp_pcs ~ ., data = Nlat_data)

Nlat_ordiR2 <- ordiR2step(Nlat_rda_null, scope = formula(Nlat_rda_full, data = Nlat_data))
Nlat_ordiR2$anova  

# NOTE(review): scope below passes data = Ndbmems while the model uses
# Nlat_data — confirm intent
ordi <- ordistep(Nlat_rda_null, scope = formula(Nlat_rda_full, data = Ndbmems)) 
ordi$anova
# NOTE(review): this call mixes objects — Y = Xn (raw allele frequencies)
# with the full-dataset adjR2.rda_full threshold; confirm it shouldn't be
# Y = Nsnp_pcs with a North-specific adjusted R2
rda.fs <- forward.sel(Y = Xn, X = Ndbmems, adjR2thresh = adjR2.rda_full)
rda.fs


# final model: latitude only
Nlat_rda_final <- rda(Nsnp_pcs ~ lat , data = Nlat_data)

# check variance inflation for the North Pacific latitude-only final model.
# BUG FIX: previously called vif.cca(lat_rda_final), the full-Pacific model;
# this section's final model is Nlat_rda_final.
vif.cca(Nlat_rda_final)

#testing significance of explanatory variables by margin (each conditioned on all the others)
rda_expl <-  anova.cca(Nlat_rda_final, by = 'margin', parallel = 3)
rda_expl$p.adj <- p.adjust (rda_expl$`Pr(>F)`, method = 'holm')
rda_expl$pve<- rda_expl$Variance/sum(rda_expl$Variance)


# NOTE(review): the two calls below use the full-Pacific objects (snp_pcs,
# lat_data, MEM4) inside the North Pacific section — confirm whether they
# should use Nsnp_pcs / Nlat_data instead
vp <- varpart(snp_pcs , lat_data[,c(1,2,4)] , lat_data[,c(5)])

rda_con_spatial<-rda(snp_pcs ~  lat + Condition(MEM1 + MEM2 + MEM4), data = lat_data)

# scores and plots for the latitude-only North Pacific model
rda_sum<-summary(Nlat_rda_final, axes = c(2))
scores <- as.data.frame(rda_sum$sites)
scores$pop<-genind_north$pop
scores$lat <- Nlat_data$lat


#species scores
arrows<-as.data.frame(rda_sum$biplot)

#plot by lat
# BUG FIX: the latitude-only model has a single constrained axis, so the
# second plotted axis is PC1 (as in the pop plot below); the segment and
# ellipse previously referenced a nonexistent RDA2 column.
ggplot()+geom_point(aes(RDA1, PC1, color = lat), data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=PC1), arrow=arrow(length=unit(0.2,"cm")), color="black")+stat_ellipse(aes(RDA1, PC1, color = lat), data = scores)+theme_classic()+xlab("Redundant Axis 1\n1.5% of Total Variance")+ylab("Redundant Axis 2\n0.48% of Total Variance")+scale_color_viridis_c()



#plot by pop
ggplot()+geom_point(aes(RDA1, PC1, color = pop), data = scores, alpha = 0.8)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=PC1), arrow=arrow(length=unit(0.2,"cm")), color="black")+geom_text(data=arrows, aes(x=RDA1+RDA1*0.3, y=PC1+PC1*0.3, label=c("Lat")), size = 4,  color="black")+stat_ellipse(aes(RDA1, PC1, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n1.5% of Total Variance")+ylab("Redundant Axis 2\n0.48% of Total Variance")+scale_color_manual(values = c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a", "#ffff99", "#b15928"))

RDA outlier analysis

Here I take what I’ve learned from the cRDAs to find spatial outliers SNPs that are candidates for a GTseq panel

First we collect the SNPs for spatial structure alone

The first step is fitting the RDAs and testing all the variables

# full-Pacific RDA on the raw allele-frequency matrix X (not the PCs) —
# used downstream to extract per-SNP loadings for outlier detection
snp_rda_null <- rda(X ~ 1, data = dbmems)
snp_rda_full <- rda(X ~ . , data = dbmems)


#check that the full model is significant
anova(snp_rda_full) # 0.002 yes it is significant - proceed to variable selection

#what the variance explained
adjR2.rda_full <- RsquareAdj(snp_rda_full)$adj.r.squared
adjR2.rda_full


#variable selection
# here we do variable selection with three different forward selection approaches
snp_ordiR2 <- ordiR2step(snp_rda_null, scope = formula(snp_rda_full, data = dbmems))
snp_ordiR2$anova  
snpordi <- ordistep(snp_rda_null, scope = formula(snp_rda_full, data = dbmems)) 
snpordi$anova
#rda.fs <- forward.sel(Y = snp_pcs, X = dbmems, adjR2thresh = adjR2.rda_full)
#rda.fs 
# final model on the two retained MEMs
snp_rda_final <- rda(X ~ MEM1 + MEM2, data = dbmems)

#testing signficance of constrained axes (how many are interesting)
rda_axis <- anova.cca(snp_rda_final, by = 'axis', parallel = 3)
rda_axis$p.adj <- p.adjust (rda_axis$`Pr(>F)`, method = 'holm')
rda_axis

#testing significance of explanatory variables by margin (each conditioned on all the others)
rda_expl <-  anova.cca(snp_rda_final, by = 'margin', parallel = 3)
rda_expl$p.adj <- p.adjust (rda_expl$`Pr(>F)`, method = 'holm')
# per-term share of the tabulated variance
rda_expl$pve<- rda_expl$Variance/sum(rda_expl$Variance)

rda_expl

## tri plot
#site score
rda_sum<-summary(snp_rda_final, axes = c(2))
scores <- as.data.frame(rda_sum$sites)
scores$pop<-genind$pop


#species scores (biplot arrows for MEM1 and MEM2)
arrows<-as.data.frame(rda_sum$biplot)

#plot by pop
ggplot()+geom_point(aes(RDA1, RDA2, color = pop), data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="black")+geom_text(data=arrows, aes(x=RDA1+RDA1*0.3, y=RDA2+RDA2*0.3, label=c("MEM1", "MEM2")), size = 4,  color="black")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n1.5% of Total Variance")+ylab("Redundant Axis 2\n0.48% of Total Variance")+scale_color_discrete()#+scale_color_viridis_d(option = "A")
#north snp rda
# North Pacific RDA on the raw allele-frequency matrix Xn, for SNP loadings

Nsnp_rda_null <- rda(Xn ~ 1, data = Ndbmems)
Nsnp_rda_full <- rda(Xn ~ . , data =Ndbmems)


#check that the full model is significant
anova(Nsnp_rda_full) # 0.002 yes it is significant - proceed to variable selection

#what the variance explained
adjR2.rda_full <- RsquareAdj(Nsnp_rda_full)$adj.r.squared
adjR2.rda_full


#variable selection
# here we do variable selection with three different forward selection approaches
Nsnp_ordiR2 <- ordiR2step(Nsnp_rda_null, scope = formula(Nsnp_rda_full, data = Ndbmems))
Nsnp_ordiR2$anova  
Nordi <- ordistep(Nsnp_rda_null, scope = formula(Nsnp_rda_full, data = Ndbmems)) 
Nordi$anova
#rda.fs <- forward.sel(Y = Xn, X = Ndbmems, adjR2thresh = adjR2.rda_full)
#rda.fs 

# BUG FIX: previously called vif.cca(rda_full), an unrelated model that is not
# defined at this point; check this section's full model instead
vif.cca(Nsnp_rda_full) # don't need to do this because mems should be orthogonal

Nsnp_rda_final <- rda(Xn ~ MEM1 + MEM2, data = Ndbmems)

## tri plot
#site score
rda_sum<-summary(Nsnp_rda_final, axes = c(2))
scores <- as.data.frame(rda_sum$sites)
scores$pop<-genind_north$pop


#species scores (biplot arrows for MEM1 and MEM2)
arrows<-as.data.frame(rda_sum$biplot)

#plot by pop
ggplot()+geom_point(aes(RDA1, RDA2, color = pop), data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="black")+geom_text(data=arrows, aes(x=RDA1+RDA1*0.3, y=RDA2+RDA2*0.3, label=c("MEM1", "MEM2")), size = 4,  color="black")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n1.5% of Total Variance")+ylab("Redundant Axis 2\n0.48% of Total Variance")+scale_color_discrete()#+scale_color_viridis_d(option = "A")

north pac SNP rda: not significant

Now that the RDAs are fit let’s pull out the SNPs that load onto the RDs.

# per-SNP loadings ("species" scores) on the constrained axes
snp_loading <- as.data.frame(scores(snp_rda_final, display = "species"))

n_snp_loading <- as.data.frame(scores(Nsnp_rda_final, display = "species"))

# histograms of z-scored loadings with |z| = 3 outlier cutoffs; titles show
# the outlier count (divided by 2 — each locus contributes two allele rows)
a <- ggplot(data = snp_loading)+geom_histogram(aes(x =scale(RDA1)))+theme_classic()+ggtitle(paste("Full Pacific RDA 1\n",sum(abs(scale(snp_loading$RDA1)) > 3)/2, sep = "" ))+xlab("Z-scores of SNP loadings")+geom_vline(aes(xintercept = 3), color = "red")+geom_vline(aes(xintercept = -3), color = "red")

b <- ggplot(data = snp_loading)+geom_histogram(aes(x =scale(RDA2)))+theme_classic()+ggtitle(paste("Full Pacific RDA 2\n",sum(abs(scale(snp_loading$RDA2)) > 3)/2, sep = "" ))+xlab("Z-scores of SNP loadings")+geom_vline(aes(xintercept = 3), color = "red")+geom_vline(aes(xintercept = -3), color = "red")

c <- ggplot(data = n_snp_loading)+geom_histogram(aes(x =scale(RDA1)))+theme_classic()+ggtitle(paste("North Pacific RDA 1\n",sum(abs(scale(n_snp_loading$RDA1)) > 3)/2, sep = "" ))+xlab("Z-scores of SNP loadings")+geom_vline(aes(xintercept = 3), color = "red")+geom_vline(aes(xintercept = -3), color = "red")

# NOTE(review): this plot uses RDA2 but its title says "North Pacific RDA 1" — confirm
d <- ggplot(data = n_snp_loading)+geom_histogram(aes(x =scale(RDA2)))+theme_classic()+ggtitle(paste("North Pacific RDA 1\n",sum(abs(scale(n_snp_loading$RDA2)) > 3)/2, sep = "" ))+xlab("Z-scores of SNP loadings")+geom_vline(aes(xintercept = 3), color = "red")+geom_vline(aes(xintercept = -3), color = "red")

cowplot::plot_grid(a,b,c,d)

#count the number of outliers 
sum(abs(scale(snp_loading$RDA1)) > 3)/2
sum(abs(scale(snp_loading$RDA2)) > 3)/2
sum(abs(scale(n_snp_loading$RDA1)) > 3)/2
sum(abs(scale(n_snp_loading$RDA2)) > 3)/2

Are SNPs that fall into the tails of these distributions rare or common?

# SNPs loading strongly (|z| > 3) on full-Pacific RDA1 (the N-S axis); compare
# their minor-allele-frequency distribution to all loci
n_s_snps <- row.names(snp_loading[which(abs(scale(snp_loading$RDA1)) > 3),])
# loading rows are named <locus>.<allele>; strip the 2-character allele suffix
n_s_snps <- str_sub(n_s_snps, 1, nchar(n_s_snps)-2)
n_s_snps <- genind[loc = n_s_snps]

mafs_n_s <- minorAllele(n_s_snps)
mafs_all <- minorAllele(genind)

x <- as.data.frame(cbind(mafs_n_s, rep("n_s", length(mafs_n_s))))
y <- as.data.frame(cbind(mafs_all, rep("all", length(mafs_all))))
colnames(x) <- c("maf", "set")
colnames(y) <- c("maf", "set")
# BUG FIX: as.numeric(levels(f))[f] only works when maf is a factor; since
# R 4.0 data frames no longer auto-factor strings, so levels() returned NULL.
# Converting via as.character() handles both factor and character columns.
y$maf <- as.numeric(as.character(y$maf))
x$maf <- as.numeric(as.character(x$maf))


maf_comp <- as.data.frame(rbind(x,y)) 

ggplot(data=maf_comp)+geom_density(aes(x = maf, color= set))+xlim(0, 0.5)

Common alleles are vastly over-represented among those that load strongly onto the axis separating N from S pacific samples.

What about the non-significant rda axes from north pac

# same MAF comparison for the (non-significant) North Pacific RDA1 outliers.
# BUG FIX: the outlier indices come from n_snp_loading, so row names must be
# taken from n_snp_loading as well (previously subset snp_loading — wrong df).
n_s_snps <- row.names(n_snp_loading[which(abs(scale(n_snp_loading$RDA1)) > 3),])
n_s_snps <- str_sub(n_s_snps, 1, nchar(n_s_snps)-2)
n_s_snps <- genind_north[loc = n_s_snps]

mafs_n_s <- minorAllele(n_s_snps)
mafs_all <- minorAllele(genind_north)

x <- as.data.frame(cbind(mafs_n_s, rep("n_s", length(mafs_n_s))))
y <- as.data.frame(cbind(mafs_all, rep("all", length(mafs_all))))
colnames(x) <- c("maf", "set")
colnames(y) <- c("maf", "set")
# robust numeric conversion (works for factor or character columns; see note
# on the full-Pacific MAF chunk)
y$maf <- as.numeric(as.character(y$maf))
x$maf <- as.numeric(as.character(x$maf))


maf_comp <- as.data.frame(rbind(x,y)) 

ggplot(data=maf_comp)+geom_density(aes(x = maf, color= set))+xlim(0, 0.5)
# private alleles per population in the full dataset
priv <- poppr::private_alleles(genind)
rowSums(priv)

Vaux north rda

#lets collect just the north pacific
# drop the two southern pops (TS, NC) from the Vaux dataset
genind_vaux_north <- genind_vaux[pop(genind_vaux) != "TS" & pop(genind_vaux) != "NC"]


#first replace missing data with mean frequency
Xn <- tab(genind_vaux_north, freq = TRUE, NA.method = "mean")

#then run pca
pca_north <- dudi.pca(Xn, scale = FALSE, scannf = FALSE, nf = 245)

#check pcs to keep with kaiser-guttman

#kaiser guttman
# keep axes whose eigenvalue exceeds the mean eigenvalue
cutoff<-mean(pca_north$eig)
kg <- length((pca_north$eig)[(pca_north$eig)>cutoff])
barplot(pca_north$eig, main = "PCA eigenvalues")
abline(h = cutoff, col = "red")

#kept 117 PCs that exceed kaiserguttman
#Nsnp_pcs <- pca_north$li[,c(1:kg)]

#nope keep em all
# NOTE(review): comment says "keep em all" but this still subsets to the first
# kg (Kaiser-Guttman) axes — confirm intent
Nsnp_pcs <- pca_north$li[,c(1:kg)]

#now plot data
Npca_plot_data <- as.data.frame(cbind(genind_vaux_north$pop, Nsnp_pcs))
Npca_plot_data <- Npca_plot_data %>%
  rename(pop = "genind_vaux_north$pop")
ggplot(data = Npca_plot_data)+geom_point(aes(Axis1, Axis2, color = pop)) + stat_ellipse(aes(Axis1, Axis2, color = pop)) +theme_classic()

ggplot(data = Npca_plot_data)+geom_point(aes(Axis3, Axis4, color = pop)) + stat_ellipse(aes(Axis3, Axis4, color = pop)) +theme_classic()


# dbMEMs for the North Pacific coordinates
# NOTE(review): NXY_big is constructed in a later chunk — this chunk must run
# after it; confirm chunk ordering in the source document
Ndbmems <- dbmem(NXY_big, store.listw = TRUE)

# permutation test of each MEM for spatial autocorrelation (Moran's I)
test <- moran.randtest(Ndbmems, nrepet = 99)
test

env variable

# subset the site table to the North Pacific populations
# NOTE(review): assumes rows 1:10 of `sites` are the N Pac sites — confirm
Nsites <- sites[c(1:10),]

# now lets get the individual level dataset
Nsites_big <- sites_big %>%
  filter(pop != "NC" & pop != "TS")

#first convert all lat lon to geodetic cartesian coords

#anti-meridian causes an issue, let's fix it
Nsites_big<- Nsites_big %>%
  mutate(Longitude = if_else(Longitude < 0, Longitude + 360, Longitude))

#now calc geodetic coords
require(SoDA)
NXY_big  <- geoXY(Nsites_big[,3], Nsites_big[,4], unit =1000)

# jitter coordinates (presumably to break ties among individuals sharing a
# site before distance-based MEM construction — confirm)
NXY_big[,1] <- jitter(NXY_big[,1])
NXY_big[,2]<- jitter(NXY_big[,2])


#now calculate dbMEMs (spatial autocorrelation)
# keep only MEMs modelling positive spatial autocorrelation.
# BUG FIX: removed the preceding dbmem(..., MEM.autocor = "all") call — its
# result was dead code, immediately overwritten by the call below.
Ndbmems <- dbmem(NXY_big, MEM.autocor = "positive")

now test and fit the RDA

#global rda
# (duplicates the North Pacific global RDA + selection chunk earlier in the
# document, here re-run for the Vaux North dataset)
Nrda_null <- rda(Nsnp_pcs ~ 1, data = Ndbmems)
Nrda_full <- rda(Nsnp_pcs ~ ., data = Ndbmems)


#check that the full model is significant
anova(Nrda_full) # 0.006 yes it is significant - proceed to variable selection

#what the variance explained
NadjR2.rda_full <- RsquareAdj(Nrda_full)$adj.r.squared
NadjR2.rda_full


#variable selection
# here we do variable selection with three different forward selection approaches
NordiR2 <- ordiR2step(Nrda_null, scope = formula(Nrda_full, data = Ndbmems))
NordiR2$anova  
Nordi <- ordistep(Nrda_null, scope = formula(Nrda_full, data = Ndbmems)) 
Nordi$anova
Nrda.fs <- forward.sel(Y = Nsnp_pcs, X = Ndbmems, adjR2thresh = NadjR2.rda_full)
Nrda.fs 

North only global model not significant…

vaux latitude RDA

#first replace missing data with mean frequency
X <- tab(genind_vaux, freq = TRUE, NA.method = "mean")

#then run pca
pca1 <- dudi.pca(X, scale = FALSE, scannf = FALSE, nf = 324)

#check pcs to keep with kaiser-guttman

#kaiser guttman
# keep axes whose eigenvalue exceeds the mean eigenvalue
cutoff<-mean(pca1$eig)
kg <- length((pca1$eig)[(pca1$eig)>cutoff])
barplot(pca1$eig, main = "PCA eigenvalues")
abline(h = cutoff, col = "red")

#kept all PCs
snp_pcs <- pca1$li[,c(1:kg)]

# predictors: dbMEMs plus absolute latitude
lat_data <- cbind(dbmems, abs(sites_big$Latitude))
# assumes dbmems has 4 columns so latitude lands in column 5 — TODO confirm
colnames(lat_data)[5] <- "lat"


lat_rda_null <- rda(snp_pcs ~ 1, data = lat_data)
lat_rda_full <- rda(snp_pcs ~ ., data = lat_data)

lat_ordiR2 <- ordiR2step(lat_rda_null, scope = formula(lat_rda_full, data = lat_data))
lat_ordiR2$anova  

lat_ordi <- ordistep(lat_rda_null, scope = formula(lat_rda_full, data = lat_data))
lat_ordi$anova  

# variance inflation in the full model (lat correlates with some MEMs)
vif.cca(lat_rda_full)

Latitude leads to variance inflation (makes sense as it correlates with some of the MEMs), and causes MEM3 and lat to be insignificant. No need to do the latitude RDA since variable selection suggests we shouldn’t include it.

# final model with latitude forced in (despite not surviving selection)
lat_rda_final <- rda(snp_pcs ~ MEM1 + MEM2 + lat, data = lat_data)

vif.cca(lat_rda_final)


#testing signficance of constrained axes (how many are interesting)
rda_axis <- anova.cca(lat_rda_final, by = 'axis', parallel = 3)
rda_axis$p.adj <- p.adjust (rda_axis$`Pr(>F)`, method = 'holm')
rda_axis

#testing significance of explanatory variables by margin (each conditioned on all the others)
rda_expl <-  anova.cca(lat_rda_final, by = 'margin', parallel = 3)
rda_expl$p.adj <- p.adjust (rda_expl$`Pr(>F)`, method = 'holm')
rda_expl$pve<- rda_expl$Variance/sum(rda_expl$Variance)

rda_expl

# variance partitioning between the MEM fraction and latitude, plus a partial
# RDA testing lat conditioned on the MEMs

vp <- varpart(snp_pcs , lat_data[,c(1,2)] , lat_data[,5])
vp

rda_con_spatial<-rda(snp_pcs ~  lat + Condition(MEM1 + MEM2), data = lat_data)
anova(rda_con_spatial)

# site scores and triplot for the Vaux latitude+MEM model
rda_sum<-summary(lat_rda_final, axes = c(2))
scores <- as.data.frame(rda_sum$sites)
scores$pop<-genind_vaux$pop
scores$lat <- lat_data$lat


#species scores
arrows<-as.data.frame(rda_sum$biplot)

#plot by lat
#ggplot()+geom_point(aes(RDA1, RDA2, color = lat), data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="black")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n1.5% of Total Variance")+ylab("Redundant Axis 2\n0.48% of Total Variance")+scale_color_viridis_c()


#plot by pop
ggplot()+geom_point(aes(RDA1, RDA2, color = pop), data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="black")+geom_text(data=arrows, aes(x=RDA1+RDA1*0.3, y=RDA2+RDA2*0.3, label=c("MEM1", "MEM2", "Latitude")), size = 4,  color="black")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n1.5% of Total Variance")+ylab("Redundant Axis 2\n0.48% of Total Variance")+scale_color_discrete()#+scale_color_viridis_d(option = "A")

Adding latitude leads to an insignificant RDA2 (we shouldn’t include it anyway because it doesn’t make it out of variable selection)

Hierarchical Structure

Methods Summary

Treefit?

Statistical Learning

Methods Summary

dropped analyses

this section is a dump for code that we did not use

MEMs

The dbMEMs are distance based and represent the decomposition of distances; however, we know more than just distances and we might be interested in spatial weighting matrices based on other spatial neighborhoods/graphs of connections among populations. In other words, there may be spatial structures other than simple spatial autocorrelation at different scales that we’d like to try to fit with an RDA, but we can’t test for these if our dbMEMs don’t capture them. So we’ll use some of the functionality in the adespatial package to choose a spatial weighting matrix to maximize r2

# dropped analysis: explore alternative spatial weighting matrices
library(adespatial);library(sp);library(spdep)
# connection network among sites (chooseCN type = 1; see adegenet docs)
nb <- chooseCN(coordinates(sitesmat), type = 1, plot.nb = TRUE)
lw <- nb2listw(nb, style = 'W', zero.policy = TRUE)

mems1 <- mem(lw)

# eigenvalues of the resulting MEMs
barplot(attr(mems1, "values"))
ggplot()+geom_point(aes(x = sitesxy$Longitude, y = sitesxy$Latitude, color = mems1$MEM5), size = 3) +theme_classic()+scale_color_viridis_c(option = "A")

# dbMEMs with a fixed truncation threshold, keeping all sign patterns
dbmems2 <- dbmem(sitesmat, store.listw = TRUE, silent = FALSE, MEM.autocor = "all", thresh = 100 )

barplot(attr(dbmems2, "values"))


#get sites matrix for full dataset
# NOTE(review): columns 249, 248 are assumed to be Longitude/Latitude after
# the join — confirm against pca_plot_data's width
sites_big <- left_join(pca_plot_data, sites, by = c("pop" = "Pop"))[,c(249,248)]
#add jitter
sites_big$Longitude <- jitter(sites_big$Longitude)
sites_big$Latitude <- jitter(sites_big$Latitude)
sites_big <- as.matrix(sites_big)
# BUG FIX: removed a stray incomplete `sites_b` line left over from editing —
# it referenced a nonexistent object and errored when run

#build list of candidate spatial weighting matrices
cands <- listw.candidates(sites_big, nb = c("gab", "rel", "mst"), weights = c("bin", "flin"))

# choose best spatial weighting matrix
best.lw <- listw.select(snp_pcs[,c(1:117)], candidates = cands, nperm = 10, nperm.global = 99, MEM.all = TRUE, method = "global")
#rda with mems from delauney triangulation

# join the site-level MEMs onto the individual-level table, then keep only the
# MEM columns (positions 247:255 — assumes table width; TODO confirm)
mems_big <- left_join(pca_plot_data, as.data.frame(cbind(mems1, sites$Pop)), by = c("pop" = "sites$Pop"))
mems_big <- mems_big[,247:255]
#global rda
rda_null <- rda(snp_pcs ~ 1, data = mems_big)
rda_full <- rda(snp_pcs ~ . , data = mems_big)


#check that the full model is significant
anova(rda_full) # 0.002 yes it is significant - proceed to variable selection

#what the variance explained
adjR2.rda_full <- RsquareAdj(rda_full)$adj.r.squared
adjR2.rda_full


#variable selection
# here we do variable selection with three different forward selection approaches
ordiR2 <- ordiR2step(rda_null, scope = formula(rda_full, data = mems_big))
ordiR2$anova #MEMs 3,2,7,9 and 6 retained by forward selection 
ordi <- ordistep(rda_null, scope = formula(rda_full, data = mems_big)) 
ordi$anova#also adds mem 1
rda.fs <- forward.sel(Y = snp_pcs, X = mems_big, adjR2thresh = adjR2.rda_full)
rda.fs # same as ordiR2


#final model
# NOTE(review): the active model includes MEM1/MEM8, a superset of the
# forward-selection result commented out above — confirm which was intended
#rda_final <- rda(snp_pcs ~ MEM3 + MEM2 + MEM7 + MEM9 + MEM6 , data = mems)
rda_final <- rda(snp_pcs ~ MEM1 + MEM8 + MEM7 + MEM2 + MEM6 + MEM3 + MEM9, data = mems_big)

#testing signficance of constrained axes (how many are interesting)
rda_axis <- anova.cca(rda_final, by = 'axis', parallel = 3)
rda_axis$p.adj <- p.adjust (rda_axis$`Pr(>F)`, method = 'holm')
rda_axis

#testing significance of explanatory variables by margin (each conditioned on all the others)
rda_expl <-  anova.cca(rda_final, by = 'margin', parallel = 3)
rda_expl$p.adj <- p.adjust (rda_expl$`Pr(>F)`, method = 'holm')
rda_expl$pve<- rda_expl$Variance/sum(rda_expl$Variance)

rda_expl


# just autocor
# restrict to the first three MEMs (positive-autocorrelation candidates),
# re-run forward selection, then refit on the retained MEM2 + MEM3

rda_pos <- rda(snp_pcs ~ MEM1 + MEM2 + MEM3, data = mems_big)
rda_null <- rda(snp_pcs ~ 1, data = mems_big)
ordiR2 <- ordiR2step(rda_null, scope = formula(rda_pos, data = mems_big))
ordiR2$anova
rda_pos <- rda(snp_pcs ~  MEM2 + MEM3, data = mems_big)


#testing signficance of constrained axes (how many are interesting)
rda_axis <- anova.cca(rda_pos, by = 'axis', parallel = 3)
rda_axis$p.adj <- p.adjust (rda_axis$`Pr(>F)`, method = 'holm')
rda_axis

#testing significance of explanatory variables by margin (each conditioned on all the others)
rda_expl <-  anova.cca(rda_pos, by = 'margin', parallel = 3)
rda_expl$p.adj <- p.adjust (rda_expl$`Pr(>F)`, method = 'holm')
rda_expl$pve<- rda_expl$Variance/sum(rda_expl$Variance)

rda_expl

## tri plot
#site score
rda_sum<-summary(rda_pos, axes = c(2))
scores <- as.data.frame(rda_sum$sites)
scores$pop<-cRDA_data$pop


#species scores (arrows for MEM2 and MEM3)
arrows<-as.data.frame(rda_sum$biplot)

#plot by pop
ggplot()+geom_point(aes(RDA1, RDA2, color = pop), data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="black")+geom_text(data=arrows, aes(x=RDA1+RDA1*0.3, y=RDA2+RDA2*0.3, label=c( "MEM2","MEM3")), size = 4,  color="black")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n0.64% of Total Variance")+ylab("Redundant Axis 2\n0.61% of Total Variance")+scale_color_discrete()
#plot mem2 and mem3 on a map together

#get data
mem_plot <- as.data.frame(cbind(sites_big, mems_big))
#fix antimeridian issue
mem_plot<- mem_plot %>%
  mutate(Longitude = if_else(Longitude < 0, Longitude + 360, Longitude))

#first make plots separately 
# NOTE(review): the commented gp2 line colors by MEM2 just like gp1 — MEM3 was
# probably intended
# gp1 <- autoplot(pac_bathy, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2200)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(aes(x = Longitude, y = Latitude, color = MEM2), data = mem_plot, size = 3)+theme(legend.position = "none")
# gp2 <- autoplot(pac_bathy, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2200)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(aes(x = Longitude, y = Latitude, color = MEM2), data = mem_plot, size = 3)+theme(legend.position = "none")

# encode MEM2 (blue channel) and MEM3 (red channel) jointly as one RGB color
mem_plot$color <- rgb(blue = scales::rescale(mem_plot$MEM2), red = scales::rescale(mem_plot$MEM3), green = 0)
autoplot(pac_bathy, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2200)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(aes(x = Longitude, y = Latitude, color = color), data = mem_plot, size = 3)+theme(legend.position = "none")+scale_color_identity()

# single-MEM maps
autoplot(pac_bathy, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2200)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(aes(x = Longitude, y = Latitude, color = MEM2), data = mem_plot, size = 3)+theme(legend.position = "none")+scale_color_viridis_c(option = "A")


autoplot(pac_bathy, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2200)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(aes(x = Longitude, y = Latitude, color = MEM3), data = mem_plot, size = 3)+theme(legend.position = "none")+scale_color_viridis_c(option = "A")